//---------------------------------------------------------------------- // Includes //---------------------------------------------------------------------- #include "MantidKernel/MultiFileNameParser.h" #include "MantidKernel/ConfigService.h" #include "MantidKernel/InstrumentInfo.h" #include "MantidKernel/Exception.h" #include <algorithm> #include <numeric> #include <iterator> #include <cassert> #include <ctype.h> #include <boost/regex.hpp> #include <boost/tokenizer.hpp> #include <boost/algorithm/string.hpp> #include <boost/lexical_cast.hpp> #include <boost/regex.hpp> #include <boost/bind.hpp> #include <Poco/Path.h> namespace Mantid { namespace Kernel { namespace MultiFileNameParsing { ///////////////////////////////////////////////////////////////////////////// // Static constants. ///////////////////////////////////////////////////////////////////////////// namespace Regexs { const std::string INST = "([A-Za-z]+|PG3|pg3)"; const std::string UNDERSCORE = "(_{0,1})"; const std::string SPACE = "(\\s*)"; const std::string COMMA = "(" + SPACE + "," + SPACE + ")"; const std::string PLUS = "(" + SPACE + "\\+" + SPACE + ")"; const std::string MINUS = "(" + SPACE + "\\-" + SPACE + ")"; const std::string COLON = "(" + SPACE + ":" + SPACE + ")"; const std::string SINGLE = "([0-9]+)"; const std::string RANGE = "(" + SINGLE + COLON + SINGLE + ")"; const std::string STEP_RANGE = "(" + SINGLE + COLON + SINGLE + COLON + SINGLE + ")"; const std::string ADD_LIST = "(" + SINGLE + "(" + PLUS + SINGLE + ")+" + ")"; const std::string ADD_RANGE = "(" + SINGLE + MINUS + SINGLE + ")"; const std::string ADD_STEP_RANGE = "(" + SINGLE + MINUS + SINGLE + COLON + SINGLE + ")"; const std::string ANY = "(" + ADD_STEP_RANGE + "|" + ADD_RANGE + "|" + ADD_LIST + "|" + STEP_RANGE + "|" + RANGE + "|" + SINGLE + ")"; const std::string LIST = "(" + ANY + "(" + COMMA + ANY + ")*" + ")"; } ///////////////////////////////////////////////////////////////////////////// // Forward declarations. ///////////////////////////////////////////////////////////////////////////// namespace { // Anonymous helper functions. std::vector<std::vector<unsigned int> > & parseToken(std::vector<std::vector<unsigned int> > & parsedRuns, const std::string & token); std::vector<std::vector<unsigned int> > generateRange(unsigned int from, unsigned int to, unsigned int stepSize, bool addRuns); void validateToken(const std::string & token); bool matchesFully(const std::string & stringToMatch, const std::string & regexString); std::string getMatchingString(const std::string & regexString, const std::string & toParse); std::string pad(std::string run, int count); } ///////////////////////////////////////////////////////////////////////////// // Scoped, global functions. ///////////////////////////////////////////////////////////////////////////// /** * Parses a string containing a comma separated list of run "tokens", where each run * token is of one of the allowed forms (a single run or a range of runs or an added * range of runs, etc.) * * @param runString :: a string containing the runs to parse, in the correct format. * * @returns a vector of vectors of unsigned ints, one int for each run, where runs * to be added are contained in the same sub-vector. * @throws std::runtime_error when runString provided is in an incorrect format. */ std::vector<std::vector<unsigned int> > parseMultiRunString(std::string runString) { // If the run string is empty, return no runs. if(runString.empty()) return std::vector<std::vector<unsigned int> >(); // Remove whitespace. runString.erase(std::remove_if( // ("Erase-remove" idiom.) runString.begin(), runString.end(), isspace), runString.end()); // Only numeric characters, or occurances of plus, minus, comma and colon are allowed. if(!matchesFully(runString,"([0-9]|\\+|\\-|,|:)+")) throw std::runtime_error( "Non-numeric or otherwise unaccetable character(s) detected."); // Tokenize on commas. std::vector<std::string> tokens; tokens = boost::split(tokens, runString, boost::is_any_of(",")); // Validate each token. std::for_each( tokens.begin(), tokens.end(), validateToken); // Parse each token, accumulate the results, and return them. return std::accumulate( tokens.begin(), tokens.end(), std::vector<std::vector<unsigned int> >(), parseToken); } ///////////////////////////////////////////////////////////////////////////// // Public member functions of Parser class. ///////////////////////////////////////////////////////////////////////////// /// Constructor. Parser::Parser() : m_runs(), m_fileNames(), m_multiFileName(), m_dirString(), m_instString(), m_underscoreString(), m_runString(), m_extString(), m_zeroPadding() {} /// Destructor. Parser::~Parser() {} /** * Takes the given multiFileName string, and calls other parts of the parser * to generate a corresponding vector of vectors of file names. * * @param multiFileName :: the string containing the multiple file names to be parsed. */ void Parser::parse(const std::string & multiFileName) { // Clear any contents of the member variables. clear(); // Set the string to parse. m_multiFileName = multiFileName; // Split the string to be parsed into sections, and do some validation. split(); // Parse the run section into unsigned ints we can use. m_runs = parseMultiRunString(m_runString); // Set up helper functor. GenerateFileName generateFileName( m_dirString + m_instString + m_underscoreString, m_extString, m_zeroPadding); // Generate complete file names for each run using helper functor. std::transform( m_runs.begin(), m_runs.end(), std::back_inserter(m_fileNames), generateFileName); } ///////////////////////////////////////////////////////////////////////////// // Private member functions of Parser class. ///////////////////////////////////////////////////////////////////////////// /** * Clears all member variables. */ void Parser::clear() { m_runs.clear(); m_fileNames.clear(); m_multiFileName.clear(); m_dirString.clear(); m_instString.clear(); m_underscoreString.clear(); m_runString.clear(); m_extString.clear(); } /** * Splits up the m_multiFileName string into component parts, to be used elsewhere by * the parser. Some validation is done here, and exceptions thrown if required * components are missing. * * @throws std::runtime_error if a required component is not present in the string. */ void Parser::split() { if(m_multiFileName.empty()) throw std::runtime_error("No file name to parse."); // (We shun the use of Poco::File here as it is unable to deal with certain // combinations of special characters, for example double commas.) // Get the extension, if there is one. size_t lastDot = m_multiFileName.find_last_of("."); if(lastDot != std::string::npos) m_extString = m_multiFileName.substr(lastDot); // Get the directory, if there is one. size_t lastSeparator = m_multiFileName.find_last_of("/\\"); if(lastSeparator != std::string::npos) m_dirString = m_multiFileName.substr(0, lastSeparator); // Slice off the directory and extension. std::string base = m_multiFileName.substr( m_dirString.size(), m_multiFileName.size() - (m_dirString.size() + m_extString.size())); // Get the instrument name using a regex. Throw if not found since this is required. m_instString = getMatchingString("^" + Regexs::INST, base); if(m_instString.empty()) throw std::runtime_error("There does not appear to be an instrument name present."); // Check if instrument exists, if not then clear the parser, and rethrow an exception. try { InstrumentInfo instInfo = ConfigService::Instance().getInstrument(m_instString); m_zeroPadding = instInfo.zeroPadding(); } catch (const Exception::NotFoundError &) { throw std::runtime_error("There does not appear to be a valid instrument name present."); } // Check for an underscore after the instrument name. size_t underscore = base.find_first_of("_"); if(underscore == m_instString.size()) m_underscoreString = "_"; // We can now deduce the run string. Throw if not found since this is required. m_runString = base.substr(m_underscoreString.size() + m_instString.size()); if(m_instString.empty()) throw std::runtime_error("There does not appear to be any runs present."); } ///////////////////////////////////////////////////////////////////////////// // Helper functor. ///////////////////////////////////////////////////////////////////////////// /** * Constructor, to accept state used in generating file names. * * @param prefix :: a string that prefixes the generated file names. * @param suffix :: a string that suffixes the generated file names. * @param zeroPadding :: the number of zeros with which to pad the run number of genrerated file names. */ GenerateFileName::GenerateFileName(const std::string & prefix, const std::string & suffix, int zeroPadding) : m_prefix(prefix), m_suffix(suffix), m_zeroPadding(zeroPadding) {} /** * Overloaded function operator that takes in a vector of runs, and returns a vector of file names. * * @param runs :: the vector of runs with which to make file names. * * @returns the generated vector of file names. */ std::vector<std::string> GenerateFileName::operator()(const std::vector<unsigned int> & runs) { std::vector<std::string> fileNames; std::transform( runs.begin(), runs.end(), std::back_inserter(fileNames), (*this) // Call other overloaded function operator. ); return fileNames; } /** * Overloaded function operator that takes in a runs, and returns a file name. * * @param runs :: the vector of runs with which to make file names. * * @returns the generated vector of file names. */ std::string GenerateFileName::operator()(unsigned int run) { std::stringstream fileName; fileName << m_prefix << pad(boost::lexical_cast<std::string>(run), m_zeroPadding) << m_suffix; return fileName.str(); } ///////////////////////////////////////////////////////////////////////////// // Anonymous helper functions. ///////////////////////////////////////////////////////////////////////////// namespace // anonymous { /** * Parses a string containing a run "token". * * Note that this function takes the form required by the "accumulate" algorithm: * it takes in the parsed runs so far and a new token to parse, and then returns * the result of appending the newly parsed token to the already parsed runs. * * @param parsedRuns :: the vector of vectors of runs parsed so far. * @param token :: the token to parse. * * @returns the newly parsed runs appended to the previously parsed runs. * @throws std::runtime_error if */ std::vector<std::vector<unsigned int> > & parseToken( std::vector<std::vector<unsigned int> > & parsedRuns, const std::string & token) { // Tokenise further, on plus, minus or colon. std::vector<std::string> subTokens; subTokens = boost::split(subTokens, token, boost::is_any_of("+-:")); std::vector<unsigned int> rangeDetails; // Convert the sub tokens to uInts. std::transform( subTokens.begin(), subTokens.end(), std::back_inserter(rangeDetails), boost::lexical_cast<unsigned int, std::string>); // We should always end up with at least 1 unsigned int here. assert(1 <= rangeDetails.size()); std::vector<std::vector<unsigned int> > runs; // E.g. "2012". if(matchesFully(token, Regexs::SINGLE)) { runs.push_back(std::vector<unsigned int>(1, rangeDetails[0])); } // E.g. "2012:2020". else if(matchesFully(token, Regexs::RANGE)) { runs = generateRange( rangeDetails[0], rangeDetails[1], 1, false ); } // E.g. "2012:2020:4". else if(matchesFully(token, Regexs::STEP_RANGE)) { runs = generateRange( rangeDetails[0], rangeDetails[1], rangeDetails[2], false ); } // E.g. "2012+2013+2014+2015". else if(matchesFully(token, Regexs::ADD_LIST)) { // No need to generate the range here, it's already there for us. runs = std::vector<std::vector<unsigned int> >(1, rangeDetails); } // E.g. "2012-2020". else if(matchesFully(token, Regexs::ADD_RANGE)) { runs = generateRange( rangeDetails[0], rangeDetails[1], 1, true ); } // E.g. "2012-2020:4". else if(matchesFully(token, Regexs::ADD_STEP_RANGE)) { runs = generateRange( rangeDetails[0], rangeDetails[1], rangeDetails[2], true ); } else { // We should never reach here - the validation done on the token previously // should prevent any other possible scenario. assert(false); } // Add the runs on to the end of parsedRuns, and return it. std::copy( runs.begin(), runs.end(), std::back_inserter(parsedRuns)); return parsedRuns; } /** * Generates a range of runs between the given numbers, increasing * or decreasing by the given step size. If addRuns is true, then the * runs will all be in the same sub vector, if false they will each be * in their own vector. * * @param from :: the start of the range * @param to :: the end of the range * @param stepSize :: the size of the steps with which to increase/decrease * @param addRuns :: whether or not to add the runs together (place in sume sub-vector) * * @returns a vector of vectors of runs. * @throws std::runtime_error if a step size of zero is specified. */ std::vector<std::vector<unsigned int> > generateRange( unsigned int from, unsigned int to, unsigned int stepSize, bool addRuns) { if(stepSize == 0) throw std::runtime_error( "Unable to generate a range with a step size of zero."); unsigned int currentRun = from; std::vector<std::vector<unsigned int> > runs; // If ascending range if(from <= to) { while(currentRun <= to) { if(addRuns) { if(runs.empty()) runs.push_back(std::vector<unsigned int>(1, currentRun)); else runs.at(0).push_back(currentRun); } else { runs.push_back(std::vector<unsigned int>(1, currentRun)); } currentRun += stepSize; } } // Else descending range else { while(currentRun >= to) { if(addRuns) { if(runs.empty()) runs.push_back(std::vector<unsigned int>(1, currentRun)); else runs.at(0).push_back(currentRun); } else { runs.push_back(std::vector<unsigned int>(1, currentRun)); } // Guard against case where stepSize would take us into negative // numbers (not supported by unsigned ints ...). if(static_cast<int>(currentRun) - static_cast<int>(stepSize) < 0) break; currentRun -= stepSize; } } return runs; } /** * Validates the given run token. * * @param :: the run token to validate. * * @throws std::runtime_error if the token is of an incorrect form. */ void validateToken(const std::string & token) { // Each token must be non-empty. if(token.size() == 0) throw std::runtime_error( "A comma-separated token is empty."); // Each token must begin and end with a numeric character. if(!matchesFully(token, "[0-9].+[0-9]|[0-9]")) throw std::runtime_error( "The token \"" + token + "\" is of an incorrect form. Does it begin or end with a plus, minus or colon?"); // Each token must be one of the acceptable forms, i.e. a single run, an added range of runs, etc. if(!matchesFully(token, Regexs::ANY)) throw std::runtime_error( "The token \"" + token + "\" is of an incorrect form."); } /** * Convenience function that matches a *complete* given string to the given regex. * * @param stringToMatch :: the string to match with the given regex. * @param regexString :: the regex with which to match the given string. * * @returns true if the string matches fully, or false otherwise. */ bool matchesFully(const std::string & stringToMatch, const std::string & regexString) { const boost::regex regex("^(" + regexString + "$)"); return boost::regex_match(stringToMatch, regex); } /** * Finds a given regex in a given string, and returns the part of the string * that matches the regex. Returns "" if there is no occurance of the regex. * * @param regex - the regular expression to find within the string * @param toParse - the string within to find the regex * * @returns the part (if any) of the given string that matches the given regex */ std::string getMatchingString(const std::string & regexString, const std::string & toParse) { boost::sregex_iterator it( toParse.begin(), toParse.end(), boost::regex(regexString) ); if(it == boost::sregex_iterator()) return ""; return it->str(); } /** * Zero pads the run number used in a file name to required length. * * @param run - the run number of the file. May as well pass by value here. * @param count - the required length of the string. * * @returns the string, padded to the required length. * @throws std::runtime_error if run is longer than size of count. */ std::string pad(std::string run, int count) { if(run.size() < count) return run.insert(0, count - run.size(), '0'); else if(run.size() > count) throw std::runtime_error("Could not parse run number \"" + run + "\" since the instrument run number length required is " + boost::lexical_cast<std::string>(count)); return run; } } // anonymous namespace } // namespace MultiFileNameParsing } // namespace Kernel } // namespace Mantid