###########################################################################################
# Converts mediawiki pages to rst pages for sphinx.
# run with -h for command line arguments
#
# sample command C:\MantidInstall\bin\python.exe C:/Mantid/tools/scripts/ConvertToRST/ConvertWikiPage.py
# -o C:/Mantid/Code/Mantid/docs/source/training/MBC_Displaying_data_Formatting.rst
#
# pandoc must be installed and in the path
#
# extends pandoc by downloading and correcting image links, and adding links for algorithms,
# fit functions and common concepts (workspace types)
#
# Limitations:
# 1. in adding text to links or images in tables the table formatting will be disrupted
# 2. pandoc creates some images in an inline format, these cannot have the align tags added
# back on, this is marked with a comment, the solution is probably to move the image from
# the inline to normal format.
# 3. Image links cannot have spaces in the filename in rst.
# The spaces are removed in the downloaded file names, but not the links in the rst files.
#
##########################################################################################
import os
import re
import sys
import urllib2
import urlparse
import argparse
import subprocess
import mantid
def readWebPage(url):
    """Fetch ``url`` and return the complete response body.

    The body is returned exactly as ``urllib2.urlopen(url).read()`` delivers
    it.  Any exception raised by ``urlopen`` or ``read`` propagates to the
    caller.
    """
    # set your environment HTTP_PROXY to be your proxy
    # for ral HTTP_PROXY=http://wwwcache.rl.ac.uk:8080
    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        # release the connection even when read() fails
        response.close()
def downloadImage(imgUrl, filePath):
    """Download ``imgUrl`` and store its content in the file ``filePath``.

    The URL is opened *before* the destination file is created, so a failed
    request does not leave an empty file behind.  Both the response and the
    output file are closed even if an error occurs while copying.

    Returns ``filePath``.
    """
    imageOnWeb = urllib2.urlopen(imgUrl)
    try:
        with open(filePath, "wb") as downloadedImage:
            # copy in 64 KiB chunks so large images are never held in memory
            while True:
                buf = imageOnWeb.read(65536)
                if not buf:
                    break
                downloadedImage.write(buf)
    finally:
        imageOnWeb.close()
    return filePath
def convertURLToRaw(url):
    """Return ``url`` with the ``action=raw`` query parameter appended.

    When ``url`` already carries a query string the parameter is joined with
    ``&``; otherwise a new query string is started with ``?``.
    """
    separator = "&" if "?" in url else "?"
    return url + separator + "action=raw"
def covertMediaWikiToRST(url):
    """Run pandoc on ``url`` (mediawiki -> rst) and return what runProcess returns.

    The command is passed as an argument list rather than a hand-quoted
    string, so quotes or spaces in ``url`` cannot break the command line.
    """
    cmd = ["pandoc", "-f", "mediawiki", "-t", "rst", url]
    # echo the command for the user, in a readable single-line form
    print(" ".join(cmd))
    return runProcess(cmd)
def runProcess(cmd):
    """Run ``cmd`` and return its combined stdout/stderr output.

    ``cmd`` is handed unchanged to ``subprocess.Popen`` (a string or an
    argument list).  stderr is merged into stdout, and Windows line endings
    in the captured output are normalised to ``\\n``.  A non-zero exit status
    is reported on stdout but does not raise.
    """
    startupinfo = None
    # STARTUPINFO only exists in the Windows build of subprocess
    if hasattr(subprocess, "STARTUPINFO"):
        startupinfo = subprocess.STARTUPINFO()
        # set the use show window flag
        startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                         startupinfo=startupinfo)
    output, error = p.communicate()
    # bytes literals are plain str on Python 2, so this matches the captured type
    output = output.replace(b"\r\n", b"\n")
    # stderr is redirected into stdout, so error is normally None
    if error is not None:
        print(error)
    if p.returncode != 0:
        print("WARNING: command exited with status %d" % p.returncode)
    return output
def _replaceKnownImageLinks(text, pattern, fileGroup, imgLinkDict, cleanInline):
    """Replace each ``pattern`` match in ``text`` whose file name is a key of ``imgLinkDict``.

    ``fileGroup`` is the regex group holding the file name.  Matches for
    unknown files are left untouched.  When ``cleanInline`` is true the
    replacement is first passed through cleanInlineImagesDefinition.
    """
    pieces = []
    lastEnd = 0
    for m in pattern.finditer(text):
        rstLink = imgLinkDict.get(m.group(fileGroup))
        if rstLink is None:
            # image pandoc emitted that has no matching mediawiki link: keep as is
            continue
        if cleanInline:
            rstLink = cleanInlineImagesDefinition(rstLink)
        pieces.append(text[lastEnd:m.start()])
        pieces.append(rstLink)
        lastEnd = m.end()
    pieces.append(text[lastEnd:])
    return "".join(pieces)


def processImageLinks(mediawikiText, rstText, imagePath, relImagePath):
    """Rewrite the image links in ``rstText`` and download the images.

    Every ``[[Image:...]]`` / ``[[File:...]]`` link found in ``mediawikiText``
    is turned into an rst image definition by generateRstImageLink.  The
    ``figure::`` / ``image::`` entries of ``rstText`` that name one of those
    files are then replaced by that definition, and each image is fetched
    into ``imagePath`` through saveImageFromMW.

    Images in ``rstText`` that have no matching mediawiki link are left
    unchanged instead of raising ``KeyError``.

    Returns the updated rst text.
    """
    mwImagePattern = re.compile(r'\[\[(Image|File):(.+?)(\|.*?)?(\|.*?)?(\|.*?)?(\|.*?)?(\|.*?)?(\|.*?)?\]\]',
                                re.IGNORECASE)
    rstImagePattern = re.compile(r'figure:: (([\w\-\_\\\/]+)\.(\w{2,5}))(.{0,50}\3){2}',
                                 re.DOTALL)
    rstSubstitutionImagePattern = re.compile(r'(figure|image):: (([\w\-\_\\\/]+)\.(\w{2,5}))')

    # map image file name -> rst definition, for all of the mediawiki links
    imgLinkDict = {}
    for m in mwImagePattern.finditer(mediawikiText):
        print(("processing image link", m.group(0)))
        rstLink = generateRstImageLink(m, relImagePath)
        imgLinkDict[m.group(2)] = rstLink
        print(rstLink)

    # first the full rst figure blocks, then the remaining figure/image entries
    retRstText = _replaceKnownImageLinks(rstText, rstImagePattern, 1, imgLinkDict, False)
    retRstText = _replaceKnownImageLinks(retRstText, rstSubstitutionImagePattern, 2,
                                         imgLinkDict, True)

    # get all of the image files
    for imageName in imgLinkDict:
        saveImageFromMW(imageName, imagePath)
    return retRstText
def saveImageFromMW(imageName, path):
    """Download the wiki image ``imageName`` into the directory ``path``.

    The local file name is ``imageName`` with all spaces removed.  If that
    file already exists nothing is fetched.  Otherwise the wiki's ``File:``
    page is read, the full-size image link is extracted from it and the image
    is downloaded.  A message is printed when the link cannot be found.
    """
    imagePath = os.path.join(path, imageName.replace(" ", ""))
    if os.path.exists(imagePath):
        # already downloaded: skip the network round trip entirely
        return

    url = "http://www.mantidproject.org/File:" + imageName.replace(" ", "_")
    print("Downloading image:  " + url)
    imagePage = readWebPage(url)
    mwImagePattern = re.compile(r'<div class="fullImageLink" id="file"><a href="([\/\w\.\-]+)">')
    match = mwImagePattern.search(imagePage)
    if match is None:
        print("WARNING: no full image link found on " + url)
        return

    # the href is site-relative, so prepend the host
    imageURL = "http://www.mantidproject.org" + match.group(1)
    print("saving  %s to %s" % (imageName, imagePath))
    downloadImage(imageURL, imagePath)
def generateRstImageLink(match, relImagePath):
    """Build an rst ``image::`` definition from a mediawiki image-link match.

    ``match`` must expose the file name as group 2 and the optional
    ``|option`` segments as groups 3 onwards (as the mediawiki image pattern
    in processImageLinks does).  Each option is translated by addImageOption
    and appended on its own indented line.  Every option group is visited,
    including the last one.
    """
    link = "image:: " + relImagePath + "/" + match.group(2) + "\n"
    # groups()[2:] are groups 3..N; options are filled left to right, so the
    # first unmatched group ends the list
    for option in match.groups()[2:]:
        if option is None:
            break
        # strip off the first character as it is the | pipe
        imageOption = addImageOption(option[1:])
        if imageOption is not None:
            link += "\t\t\t" + imageOption + "\n"
    return link
def addImageOption(mwImageOption):
    """Translate one mediawiki image option into an rst image option line.

    * a value ending in ``px``          -> ``:width: <value>``
    * ``left``/``right``/``middle``/``center`` -> ``:align: <value>``
      (``centre`` is accepted and written as ``center``)
    * anything else                      -> ``:alt: <value>``

    Returns ``None`` when the option is empty or only whitespace.
    """
    imageOption = mwImageOption.strip()
    if not imageOption:
        return None
    if imageOption.endswith("px"):
        return ":width: " + imageOption
    # mediawiki spells it "center"; rst only understands "center" as well
    alignments = {"right": "right", "left": "left", "middle": "middle",
                  "center": "center", "centre": "center"}
    if imageOption in alignments:
        return ":align: " + alignments[imageOption]
    return ":alt: " + imageOption
def cleanInlineImagesDefinition(rstLink):
    """Move the ``:align:`` line of an image definition into a trailing FIXME note.

    If ``rstLink`` has a line consisting only of an ``:align:`` option, that
    line (and the character following the match) is cut out and appended
    after a ``.. FIXME`` marker.  Without such a line the text is returned
    unchanged.
    """
    alignLine = re.compile(r'^\s+:align:\s+\w+\s*$', re.MULTILINE).search(rstLink)
    if alignLine is None:
        return rstLink

    head = rstLink[:alignLine.start()]
    # +1 also drops the character right after the matched text
    tail = rstLink[alignLine.end() + 1:]
    marker = ".. FIXME (inline definition does not allow align)"
    return "".join([head, tail, marker, alignLine.group(0)])
def ensureDirectoriesExist(path):
    """Create the directory ``path`` (with any missing parents) if needed.

    An empty ``path`` is treated as "nothing to create".  An already existing
    directory is not an error.  Any other failure (e.g. permission denied, or
    ``path`` exists but is a file) is re-raised as the original ``OSError``
    instead of being silently ignored.
    """
    if not path:
        return
    try:
        os.makedirs(path)
    except OSError:
        # only "the directory is already there" is safe to ignore
        if not os.path.isdir(path):
            raise
def addLocalLinks(rstText, list, prefix):
    """Turn known names in ``rstText`` into sphinx ``:ref:`` links.

    ``list`` holds the names to link and ``prefix`` is prepended to each name
    to form the reference target.  Two kinds of text are rewritten:

    * a plain occurrence of a name (optionally wrapped in ``*`` / ``**``)
      that is not directly preceded by a backtick or ``<`` and not directly
      followed by a backtick or ``_`` becomes
      ``:ref:`Name <prefixName>```;
    * an existing link of the form ```text <Name>`__`` becomes
      ``:ref:`text <prefixName>```.

    Names are regex-escaped.  The neighbouring characters are only inspected,
    not consumed, so names at the very start or end of the text, or separated
    by a single character, are linked too.  An empty ``list`` returns
    ``rstText`` unchanged.
    """
    names = [re.escape(name) for name in list]
    if not names:
        # an empty alternation would match the empty string everywhere
        return rstText
    alternatives = r"\b" + r"\b|\b".join(names) + r"\b"

    # plain words; \2 requires the same emphasis markers on both sides
    wordPattern = re.compile(r"(?<![`<])((\*{0,2})(" + alternatives + r")\2)(?![`_])")

    def _linkWord(m):
        rstLink = ":ref:`" + m.group(3) + " <" + prefix + m.group(3) + ">`"
        print(("processing new link", m.group(1), rstLink))
        return rstLink

    retRstText = wordPattern.sub(_linkWord, rstText)

    # existing links whose target is one of the names
    linkPattern = re.compile(r"`(.+?)<(" + alternatives + r")>`__")

    def _linkExisting(m):
        rstLink = ":ref:`" + m.group(1) + " <" + prefix + m.group(2) + ">`"
        print(("processing existing link", m.group(0), rstLink))
        return rstLink

    return linkPattern.sub(_linkExisting, retRstText)
def fixUnderscoresInRefLinks(rstText):
    """Return ``rstText`` with escaped underscores un-escaped.

    Every ``\\_`` that sits at a word boundary (i.e. directly after a word
    character) is replaced by a plain ``_``.
    """
    escapedUnderscore = re.compile(r"\b\\\_")
    return escapedUnderscore.sub("_", rstText)
################################################################################################################
# ---- command line handling ----
parser = argparse.ArgumentParser(description='Converts mediawiki pages to rst pages for sphinx.')
parser.add_argument('inputURL', help='the url of a mediawiki page')
parser.add_argument('-o', '--o',
                    help='Provide a path to output to a file')
parser.add_argument('-i', '--i',
                    help='Provide a relative path from the file to the images directory')
parser.add_argument('-rp', '--rp',
                    help='Provide a reference link prefix')
parser.add_argument('-r', '--r',
                    help='Provide a reference link')
args = parser.parse_args()
print(args.inputURL)
print(args.o)

# the page name is the URL path without its leading slash
urlSegments = urlparse.urlparse(args.inputURL)
pageName = urlSegments.path[1:]
print("Parsing  " + pageName)

# read the raw mediawiki markup (processImageLinks rebuilds the image links from it)
mediawikiText = readWebPage(convertURLToRaw(args.inputURL))
# run pandoc and get the output in rst
rstText = covertMediaWikiToRST(convertURLToRaw(args.inputURL))

# ---- work out where the images and the output go ----
relPath = "../images"
if args.i is not None:
    # NOTE(review): dirname drops the last path component, so "-i ../images"
    # yields ".." - confirm whether callers are expected to pass a trailing
    # slash or a file path here.
    relPath = os.path.dirname(args.i)
outDir = os.curdir
if args.o is not None:
    outDir = os.path.dirname(args.o)
imagePath = os.path.join(outDir, relPath)
imagePath = os.path.abspath(imagePath)
print("%s %s" % (imagePath, outDir))
ensureDirectoriesExist(imagePath)
ensureDirectoriesExist(outDir)

# ---- perform any processing beyond pandoc ----
rstText = fixUnderscoresInRefLinks(rstText)
rstText = processImageLinks(mediawikiText, rstText, imagePath, relPath)

# concept pages that are linked without a prefix
# (a missing comma used to fuse 'RAW File' and 'Shared Pointer' into one entry)
wsList = ['EventWorkspace', 'MatrixWorkspace', 'PeaksWorkspace', 'MDWorkspace', 'Table Workspaces',
          'WorkspaceGroup', 'RectangularDetector', 'RAW File', 'Facilities File', 'FitConstraint',
          'Framework Manager', 'Instrument Data Service', 'InstrumentDefinitionFile',
          'InstrumentParameterFile', 'Properties File', 'MDHistoWorkspace', 'MDNorm', 'Nexus file',
          'Point and space groups', 'Shared Pointer', 'Symmetry groups', 'Unit Factory',
          'UserAlgorithms', 'Workflow Algorithm', 'Workspace', 'Workspace2D']
rstText = addLocalLinks(rstText, wsList, "")
algList = mantid.AlgorithmFactory.getRegisteredAlgorithms(False).keys()
rstText = addLocalLinks(rstText, algList, "algm-")
funcList = mantid.FunctionFactory.getFunctionNames()
rstText = addLocalLinks(rstText, funcList, "func-")

# ---- add reference name ----
refPrefix = "train-"
if args.rp is not None:
    refPrefix = args.rp
refName = pageName
if args.r is not None:
    refName = args.r
rstText = ".. _" + refPrefix + refName + ":\n\n" + rstText

# ---- save output ----
if args.o is not None:
    # write and close the file explicitly instead of rebinding sys.stdout
    with open(args.o, 'w') as outFile:
        outFile.write(rstText + "\n")
else:
    print(rstText)