# ConvertWikiPage.py

###########################################################################################
# Converts mediawiki pages to rst pages for sphinx.
# run with -h for command line arguments
#
# sample command C:\MantidInstall\bin\python.exe C:/Mantid/tools/scripts/ConvertToRST/ConvertWikiPage.py
#                       <URL of the mediawiki page>
#                       -o C:/Mantid/Code/Mantid/docs/source/training/MBC_Displaying_data_Formatting.rst
#
# pandoc must be installed and on the path
#
# extends pandoc by downloading and correcting image links, and adding links for algorithms,
# fit functions and common concepts (workspace types)
#
# Limitations:
# 1. in adding text to links or images in tables the table formatting will be disrupted
# 2. pandoc creates some images in an inline format, these cannot have the align tags added
#    back on, this is marked with a comment, the solution is probably to move the image from
#    the inline to normal format.
# 3. Image links cannot have spaces in the filename in rst.
#    The spaces are removed in the downloaded file names, but not the links in the rst files.
#
##########################################################################################
import os
import re
import sys
import urllib2
import urlparse
import argparse
import subprocess
import mantid

def readWebPage(url):
    """Fetch a URL and return the body of the response.

    Set your environment HTTP_PROXY to be your proxy,
    for ral HTTP_PROXY=http://wwwcache.rl.ac.uk:8080

    url -- address of the page to read

    Returns whatever the response's read() yields for the whole body.
    Errors raised by urllib2.urlopen() or read() propagate to the caller.
    """
    aResp = urllib2.urlopen(url)
    try:
        return aResp.read()
    finally:
        # release the connection even when read() fails
        aResp.close()

def downloadImage(imgUrl, filePath):
    """Download imgUrl and store its content in filePath.

    imgUrl   -- address of the image to fetch
    filePath -- file to write; it is opened in binary mode and overwritten

    Returns filePath. Errors from urllib2.urlopen() or from the file
    operations propagate to the caller.
    """
    # Open the remote side first: if the request fails, no empty file is
    # created that a later run could mistake for a finished download.
    imageOnWeb = urllib2.urlopen(imgUrl)
    try:
        with open(filePath, "wb") as downloadedImage:
            # copy in 64 KiB chunks so the image is never held in memory whole
            while True:
                buf = imageOnWeb.read(65536)
                if not buf:
                    break
                downloadedImage.write(buf)
    finally:
        imageOnWeb.close()

    return filePath

def convertURLToRaw(url):
    """Return url with the mediawiki 'action=raw' query parameter appended.

    url -- address of a mediawiki page

    The parameter is introduced with '?' when the url has no query string
    yet and with '&' when it already contains one (for example
    index.php?title=Page), so the result stays a well-formed query.
    """
    separator = "&" if "?" in url else "?"
    return url + separator + "action=raw"

def covertMediaWikiToRST(url):
    """Convert the mediawiki source at url to rst with pandoc.

    url -- address handed to pandoc as its (double quoted) input argument

    The command line is echoed to stdout before it is run; the return
    value is whatever runProcess() returns for that command.
    """
    pandocCommand = "".join(['pandoc -f mediawiki -t rst "', url, '"'])
    print(pandocCommand)
    return runProcess(pandocCommand)

def runProcess(cmd):
    """Run a command and return its combined stdout/stderr output.

    cmd -- the command, either as a single command line string or as a
           sequence of arguments

    stderr is redirected into stdout, so the returned text holds both
    streams. Windows line endings ("\\r\\n") are normalised to "\\n".
    """
    import shlex

    popenArgs = {"stdout": subprocess.PIPE, "stderr": subprocess.STDOUT}
    if os.name == "nt":
        # STARTUPINFO only exists on Windows; it is used here to set the
        # use-show-window flag for the child process.
        startupinfo = subprocess.STARTUPINFO()
        startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
        popenArgs["startupinfo"] = startupinfo
    elif not isinstance(cmd, (list, tuple)):
        # Without a shell, POSIX treats a command *string* as the name of
        # the program to execute, so split it into arguments first.
        cmd = shlex.split(cmd)

    p = subprocess.Popen(cmd, **popenArgs)
    # stderr is merged into stdout, so the second item returned by
    # communicate() is always None and is not worth reporting.
    output = p.communicate()[0]
    # bytes literals: identical to plain str literals under Python 2
    return output.replace(b"\r\n", b"\n")

def processImageLinks(mediawikiText,rstText, imagePath, relImagePath):
    """Rewrite the image directives in rstText and download the images.

    mediawikiText -- raw mediawiki markup; its [[Image:...]] / [[File:...]]
                     links supply the image names and options
    rstText       -- the rst text whose figure/image directives are rewritten
    imagePath     -- directory passed to saveImageFromMW() for the downloads
    relImagePath  -- path prefix used inside the generated image directives

    Returns the updated rst text. Directives that name an image with no
    matching mediawiki link are reported and left untouched instead of
    aborting the conversion with a KeyError.
    """
    retRstText = rstText
    mwImagePattern = re.compile(r'\[\[(Image|File):(.+?)(\|.*?)?(\|.*?)?(\|.*?)?(\|.*?)?(\|.*?)?(\|.*?)?\]\]',
                                re.IGNORECASE)
    # a figure directive plus the following text in which the file extension
    # shows up two more times (presumably the :alt: line and the caption
    # that pandoc writes -- verify against pandoc output)
    rstImagePattern = re.compile(r'figure:: (([\w\-\_\\\/]+)\.(\w{2,5}))(.{0,50}\3){2}',
                            re.DOTALL)
    rstSubstitutionImagePattern = re.compile(r'(figure|image):: (([\w\-\_\\\/]+)\.(\w{2,5}))')

    imgLinkDict = {}
    #for all of the mediawiki links
    for m in re.finditer(mwImagePattern, mediawikiText):
        print ("processing image link",m.group(0))
        rstLink = generateRstImageLink(m,relImagePath)
        imgLinkDict[m.group(2)] = rstLink
        print (rstLink)

    #for all of the rst figure links
    replacements = []
    for m in re.finditer(rstImagePattern, retRstText):
        rstLink = imgLinkDict.get(m.group(1))
        if rstLink is None:
            print("no mediawiki link found for image " + m.group(1) + ", leaving it unchanged")
            continue
        replacements.append((m.start(), m.end(), rstLink))

    #perform replacements in reverse order so earlier offsets stay valid
    for (start,end,rstLink) in reversed(replacements):
        retRstText = retRstText[0:start] + rstLink + retRstText[end:]

    #then the remaining (inline / substitution) image definitions
    replacements = []
    for m in re.finditer(rstSubstitutionImagePattern, retRstText):
        rstLink = imgLinkDict.get(m.group(2))
        if rstLink is None:
            # also reached for directives written by the pass above when
            # relImagePath itself matches the file name character class
            print("no mediawiki link found for image " + m.group(2) + ", leaving it unchanged")
            continue
        rstLink = cleanInlineImagesDefinition(rstLink)
        replacements.append((m.start(), m.end(), rstLink))

    #perform replacements in reverse order so earlier offsets stay valid
    for (start,end,rstLink) in reversed(replacements):
        retRstText = retRstText[0:start] + rstLink + retRstText[end:]

    #get all of the image files
    for imageName in imgLinkDict:
        saveImageFromMW(imageName,imagePath)

    return retRstText

def saveImageFromMW(imageName,path):
    """Download one image from the mantidproject.org wiki into path.

    imageName -- file name as written in the mediawiki link; spaces become
                 underscores in the wiki URL and are removed from the
                 local file name
    path      -- directory in which the image is stored

    Nothing is fetched when the local file already exists. When the wiki
    file page contains no full-size image link a message is printed and
    nothing is saved.
    """
    imagePath = path + "/" + imageName.replace(" ","")
    if os.path.exists(imagePath):
        # already downloaded: skip the request for the wiki page as well
        return

    url =  "http://www.mantidproject.org/File:"+imageName.replace(" ","_")
    print("Downloading image:  " + url)
    imagePage = readWebPage(url)

    # the wiki file page links to the full size image inside this div
    mwImagePattern = re.compile(r'<div class="fullImageLink" id="file"><a href="([\/\w\.\-]+)">')
    match = re.search(mwImagePattern,imagePage)
    if match is None:
        print("no full size image link found on " + url)
        return

    imageURL = "http://www.mantidproject.org" + match.group(1)
    print("saving  " + imageName + " to " + imagePath)
    downloadImage(imageURL,imagePath)

def generateRstImageLink(match,relImagePath):
    """Build the rst image directive for one mediawiki image link.

    match        -- regex match of a mediawiki image link: group 2 is the
                    file name and every later group an optional "|option"
    relImagePath -- path prefix put in front of the file name

    Returns the "image:: ..." line followed by one tab-indented line per
    option that addImageOption() translates.
    """
    link = "image:: " + relImagePath+ "/" + match.group(2) + "\n"
    # groups()[2:] is group 3 up to and including the last group; the
    # previous range(3, len(groups)) stopped one short and dropped it
    for rawOption in match.groups()[2:]:
        if rawOption is None:
            break
        #strip off the first character as it is the | pipe
        imageOption = addImageOption(rawOption[1:])
        if imageOption is not None:
            link += "\t\t\t" + imageOption + "\n"
    return link

def addImageOption(mwImageOption):
    """Translate one mediawiki image option into an rst directive option.

    mwImageOption -- option text without the leading pipe

    Returns ":width: ..." for options ending in "px", ":align: ..." for
    alignment keywords and ":alt: ..." for anything else. Returns None
    when the option is empty or only whitespace.
    """
    imageOption = mwImageOption.strip()
    if not imageOption:
        return None
    if imageOption.endswith("px"):
        return ":width: " + imageOption
    if imageOption in ("center", "centre"):
        # mediawiki writes "center"; rst accepts only that spelling
        return ":align: center"
    if imageOption in ("right", "left", "middle"):
        return ":align: " + imageOption
    return ":alt: " + imageOption

def cleanInlineImagesDefinition(rstLink):
    """Move the :align: option of an image definition into a FIXME comment.

    rstLink -- image directive text, one option per line

    When an indented ":align: <word>" line is present it is cut out
    (together with the character following the match) and appended to
    the end of the text behind a ".. FIXME" marker. Text without such a
    line is returned unchanged.
    """
    alignLine = re.compile(r'^\s+:align:\s+\w+\s*$', re.MULTILINE).search(rstLink)
    if alignLine is None:
        return rstLink

    beforeAlign = rstLink[:alignLine.start()]
    # + 1 also skips the character right after the matched line
    afterAlign = rstLink[alignLine.end() + 1:]
    return "".join([beforeAlign,
                    afterAlign,
                    ".. FIXME (inline definition does not allow align)",
                    alignLine.group(0)])

def ensureDirectoriesExist(path):
    """Create the directory path, including missing parents.

    path -- directory to create; an empty path is a no-op

    An already existing directory is not an error. Any other failure of
    os.makedirs() (for example a regular file in the way) is re-raised
    as OSError instead of being silently ignored.
    """
    if not path:
        return
    try:
        os.makedirs(path)
    except OSError:
        # only "the directory is already there" is safe to ignore
        if not os.path.isdir(path):
            raise

def addLocalLinks(rstText,list,prefix):
    """Turn mentions of known names into sphinx :ref: links.

    rstText -- the rst text to process
    list    -- names to link (the parameter name shadows the builtin and
               is kept only for compatibility with existing callers)
    prefix  -- text put in front of the name to form the reference target

    Two passes are made. Plain words, optionally wrapped in * or **,
    become :ref:`name <prefix+name>`. Existing `text <name>`__ links
    become :ref:`text <prefix+name>`. Words directly after a backtick or
    '<', or directly before a backtick or '_', are left alone in the first
    pass. Returns the updated text; with no usable names the text is
    returned unchanged.
    """
    # escape the names so they are matched literally, and drop empty ones:
    # an empty alternative would match at every word boundary
    names = [re.escape(name) for name in list if name]
    if not names:
        return rstText
    alternatives = r"\b|\b".join(names)

    retRstText = rstText
    #build regex string for simple words
    # Lookarounds test the neighbouring characters without consuming them,
    # so names at the very start or end of the text, and names separated
    # by a single character, are found as well.
    regex = r"(?<![`<])((\*{0,2})(\b" + alternatives + r"\b)\2)(?![`_])"
    pattern = re.compile(regex)

    replacements = []
    for m in re.finditer(pattern, retRstText):
        rstLink = ":ref:`" + m.group(3) + " <" + prefix + m.group(3) + ">`"
        print ("processing new link",m.group(1), rstLink)
        replacements.append((m.start(1), m.end(1), rstLink))

    #perform replacements in reverse order so earlier offsets stay valid
    for (start,end,rstLink) in reversed(replacements):
        retRstText = retRstText[0:start] + rstLink + retRstText[end:]

    #build regex string for links
    # [^`] keeps the link text inside one pair of backticks; '.' could run
    # across several links on the same line
    regexLink = r"`([^`]+?)<(\b" + alternatives + r"\b)>`__"
    patternLink = re.compile(regexLink)
    replacements = []
    for m in re.finditer(patternLink, retRstText):
        rstLink = ":ref:`" + m.group(1) + " <" + prefix + m.group(2) + ">`"
        print ("processing existing link",m.group(0), rstLink)
        replacements.append((m.start(), m.end(), rstLink))

    #perform replacements in reverse order so earlier offsets stay valid
    for (start,end,rstLink) in reversed(replacements):
        retRstText = retRstText[0:start] + rstLink + retRstText[end:]

    return retRstText

def fixUnderscoresInRefLinks(rstText):
    """Remove the backslash from escaped underscores that follow a word.

    rstText -- the rst text to process

    Every backslash-underscore pair that sits at a word boundary (that is,
    directly after a word character) is replaced by a plain underscore;
    the replacement is applied to the whole text. Returns the new text.
    """
    escapedUnderscore = re.compile(r"\b\\\_")
    return escapedUnderscore.sub("_", rstText)

################################################################################################################

# Command line driver: fetch the wiki page, convert it with pandoc, post
# process the result and write it to the output file (or to stdout).
parser = argparse.ArgumentParser(description='Converts mediawiki pages to rst pages for sphinx.')
parser.add_argument('inputURL', help='the url of a mediawiki page')
parser.add_argument('-o', '--o',
                    help='Provide a path to output to a file')
parser.add_argument('-i', '--i',
                    help='Provide a relative path from the file to the images directory')
parser.add_argument('-rp', '--rp',
                    help='Provide a reference link prefix')
parser.add_argument('-r', '--r',
                    help='Provide a reference link')

args = parser.parse_args()
print(args.inputURL)
print(args.o)

#the page name is the url path without its leading slash
urlSegments = urlparse.urlparse(args.inputURL)
pageName = urlSegments.path[1:]
print("Parsing  " + pageName)

#read the raw mediawiki markup, it is needed for the image links
mediawikiText = readWebPage(convertURLToRaw(args.inputURL))

#run pandoc and get the output in rst
rstText = covertMediaWikiToRST(convertURLToRaw(args.inputURL))

relPath = "../images"
if args.i is not None:
    # NOTE(review): dirname() drops the last path component of -i although
    # the help text describes -i as the images directory itself -- confirm
    relPath = os.path.dirname(args.i)
outDir = os.curdir
if args.o is not None:
    #a bare file name has no directory part, fall back to the current directory
    outDir = os.path.dirname(args.o) or os.curdir

imagePath = os.path.join(outDir,relPath)
imagePath = os.path.abspath(imagePath)
print(imagePath + " " + outDir)

ensureDirectoriesExist(imagePath)
ensureDirectoriesExist(outDir)

#perform any processing beyond pandoc
rstText = fixUnderscoresInRefLinks(rstText)

rstText = processImageLinks(mediawikiText,rstText, imagePath, relPath)

#common concepts; every entry needs its own comma, adjacent string
#literals are silently joined into one name
wsList = ['EventWorkspace','MatrixWorkspace','PeaksWorkspace','MDWorkspace','Table Workspaces','WorkspaceGroup',
          'RectangularDetector','RAW File','Facilities File','FitConstraint','Framework Manager',
          'Instrument Data Service','InstrumentDefinitionFile','InstrumentParameterFile','Properties File',
          'MDHistoWorkspace','MDNorm','Nexus file','Point and space groups',
          'Shared Pointer','Symmetry groups','Unit Factory','UserAlgorithms','Workflow Algorithm',
          'Workspace','Workspace2D']
rstText = addLocalLinks(rstText,wsList,"")

algList = mantid.AlgorithmFactory.getRegisteredAlgorithms(False).keys()
rstText = addLocalLinks(rstText,algList,"algm-")

funcList = mantid.FunctionFactory.getFunctionNames()
rstText = addLocalLinks(rstText,funcList,"func-")

#add reference name
refPrefix = "train-"
if args.rp is not None:
    refPrefix = args.rp
refName = pageName
if args.r is not None:
    refName = args.r
rstText = ".. _" + refPrefix + refName + ":\n\n" + rstText

#save output
if args.o is not None:
    #write and close the file explicitly rather than rebinding sys.stdout
    with open(args.o, 'w') as outFile:
        outFile.write(rstText + "\n")
else:
    print(rstText)