// FileFinder.cpp
// Mantid Repository : https://github.com/mantidproject/mantid
//
// Copyright © 2018 ISIS Rutherford Appleton Laboratory UKRI,
//     NScD Oak Ridge National Laboratory, European Spallation Source
//     & Institut Laue - Langevin
// SPDX - License - Identifier: GPL - 3.0 +
//----------------------------------------------------------------------
// Includes
//----------------------------------------------------------------------
#include "MantidAPI/FileFinder.h"
#include "MantidAPI/ArchiveSearchFactory.h"
#include "MantidAPI/FrameworkManager.h"
#include "MantidAPI/IArchiveSearch.h"
#include "MantidKernel/ConfigService.h"
#include "MantidKernel/FacilityInfo.h"
#include "MantidKernel/InstrumentInfo.h"
#include "MantidKernel/Strings.h"
#include <MantidKernel/StringTokenizer.h>
#include <Poco/Exception.h>
#include <Poco/File.h>
#include <Poco/Path.h>
#include <boost/lexical_cast.hpp>
#include <boost/regex.hpp>

#include <algorithm>
#include <cctype>
#include <boost/algorithm/string.hpp>

namespace {
/// static logger object
Mantid::Kernel::Logger g_log("FileFinder");

/**
 * Unary predicate for use with remove_if.  Checks for the existence of
 * a "*" wild card in the file extension string passed to it.
 *
 * @param ext :: the extension to check.
 *
 * @returns true if extension contains a "*", else false.
 */
bool containsWildCard(const std::string &ext) {
  return ext.find('*') != std::string::npos;
}
} // namespace
namespace Mantid {
namespace API {
using std::string;

// The one suffix that is allowed to follow a run number in a hint. This
// single string could be generalised to a list of allowed suffixes; it is
// currently used only by the ISIS SANS group.
const std::string FileFinderImpl::ALLOWED_SUFFIX = "-add";
//----------------------------------------------------------------------
// Public member functions
//----------------------------------------------------------------------
/**
 * Default constructor.
 *
 * Loads the framework plugins and then sets the glob option from the
 * "filefinder.casesensitive" configuration value, falling back to
 * case-insensitive searching when that value is not available.
 */
FileFinderImpl::FileFinderImpl() {
  // Make sure plugins are loaded
  FrameworkManager::Instance().loadPlugins();

  // Start from a defined value, then determine from the Mantid property how
  // case sensitive the file finder should be.
  m_globOption = Poco::Glob::GLOB_DEFAULT;
  setCaseSensitive(Kernel::ConfigService::Instance()
                       .getValue<bool>("filefinder.casesensitive")
                       .get_value_or(false));
}
/**
 * Choose whether file lookups distinguish between upper and lower case.
 * @param cs :: true selects the default (case sensitive) glob option, false
 * selects the caseless glob option
 */
void FileFinderImpl::setCaseSensitive(const bool cs) {
  m_globOption = cs ? Poco::Glob::GLOB_DEFAULT : Poco::Glob::GLOB_CASELESS;
}
/**
 * Report whether the file finder is currently case sensitive.
 * @return true when the glob option is the default (case sensitive) one,
 * false otherwise
 */
bool FileFinderImpl::getCaseSensitive() const {
  const bool usesDefaultGlob = (Poco::Glob::GLOB_DEFAULT == m_globOption);
  return usesDefaultGlob;
}
/**
 * Return the full path to the file given its name.
 *
 * The lookup is delegated to the ConfigService, passing on the current glob
 * option so that the case sensitivity setting is honoured.
 *
 * @param filename :: A file name (without path) including extension
 * @param ignoreDirs :: If true, directories that match are skipped unless the
 * path given is already absolute
 * @return The full path if the file exists and can be found in one of the
 * search locations or an empty string otherwise.
 */
std::string FileFinderImpl::getFullPath(const std::string &filename,
                                        const bool ignoreDirs) const {
  return Kernel::ConfigService::Instance().getFullPath(filename, ignoreDirs,
                                                       m_globOption);
}
/** Run numbers can be followed by an allowed string. Check if there is
 *  one, remove it from the name and return the string, else return empty.
 *
 *  The check is made against the base name of the hint (any file extension
 *  is ignored); the suffix only counts when it sits at the very end of that
 *  base name.
 *
 *  @param userString run number that may have a suffix; on a match its last
 *  ALLOWED_SUFFIX.size() characters are removed
 *  @return the suffix, if there was one, otherwise an empty string
 */
std::string
FileFinderImpl::extractAllowedSuffix(std::string &userString) const {
  if (userString.find(ALLOWED_SUFFIX) == std::string::npos) {
    // short cut processing as normally there is no suffix
    return "";
  }

  // ignore any file extension in checking if a suffix is present
  Poco::Path entry(userString);
  const std::string noExt(entry.getBaseName());
  const size_t repNumChars = ALLOWED_SUFFIX.size();

  // A base name shorter than the suffix cannot end with it. Without this
  // guard the unsigned subtraction below wraps around and, for a base name
  // exactly one character shorter than the suffix, equals npos - which is
  // also what find() returns when nothing is found, giving a false match.
  if (noExt.size() < repNumChars) {
    return "";
  }

  if (noExt.find(ALLOWED_SUFFIX) == noExt.size() - repNumChars) {
    // NOTE(review): the characters removed are the trailing ones of
    // userString, so this assumes the hint carries no file extension at this
    // point - confirm against callers.
    userString.replace(userString.size() - repNumChars, repNumChars, "");
    return ALLOWED_SUFFIX;
  }
  return "";
}
/**
 * Return the InstrumentInfo as determined from the hint.
 *
 * Only the file-name part of the hint is examined. Hints that are empty or
 * start with a digit carry no instrument name and yield the default
 * instrument.
 *
 * @param hint :: The name hint.
 * @return This will return the default instrument if it cannot be determined.
 */
const Kernel::InstrumentInfo
FileFinderImpl::getInstrument(const string &hint) const {
  // The <cctype> classifiers have undefined behaviour for negative char
  // values, so always go through unsigned char.
  const auto isDigit = [](const unsigned char c) {
    return std::isdigit(c) != 0;
  };
  const auto isAlpha = [](const unsigned char c) {
    return std::isalpha(c) != 0;
  };

  if ((!hint.empty()) && (!isDigit(hint[0]))) {
    string instrName(hint);
    Poco::Path path(instrName);
    instrName = path.getFileName();
    if ((instrName.find("PG3") == 0) || (instrName.find("pg3") == 0)) {
      instrName = "PG3";
    }
    // We're extending this nasty hack to accommodate data archive searching
    // for SANS2D.
    // While this certainly shouldn't be considered good practice, #7515 exists
    // to completely redesign FileFinder -- this quick fix will have to do
    // until all this code gets an overhaul as part of that ticket.  Please
    // think twice before adding any more instruments to this list.
    else if ((instrName.find("SANS2D") == 0) ||
             (instrName.find("sans2d") == 0)) {
      instrName = "SANS2D";
    } else {
      // go forwards looking for the run number to start
      {
        const auto firstDigit =
            std::find_if(instrName.cbegin(), instrName.cend(), isDigit);
        const auto nChars = static_cast<string::size_type>(
            std::distance(instrName.cbegin(), firstDigit));
        instrName = instrName.substr(0, nChars);
      }

      // go backwards looking for the instrument name to end - gets around
      // delimiters
      if (!instrName.empty()) {
        const auto lastAlpha =
            std::find_if(instrName.crbegin(), instrName.crend(), isAlpha);
        const auto nChars = static_cast<string::size_type>(
            std::distance(lastAlpha, instrName.crend()));
        instrName = instrName.substr(0, nChars);
      }
    }
    try {
      const Kernel::InstrumentInfo instrument =
          Kernel::ConfigService::Instance().getInstrument(instrName);
      return instrument;
    } catch (const Kernel::Exception::NotFoundError &e) {
      // Unknown name: report it and fall back to the default instrument.
      g_log.debug() << e.what() << "\n";
    }
  }
  return Kernel::ConfigService::Instance().getInstrument();
}
/**
 * Extracts the instrument name and run number from a hint
 * @param hint :: The name hint
 * @return A pair of instrument name and run number
 */
std::pair<std::string, std::string>
FileFinderImpl::toInstrumentAndNumber(const std::string &hint) const {
  // g_log.debug() << "toInstrumentAndNumber(" << hint << ")\n";
  std::string instrPart;
  std::string runPart;

  if (isdigit(hint[0])) {
    instrPart = Kernel::ConfigService::Instance().getInstrument().shortName();
    runPart = hint;
  } else {
    /// Find the last non-digit as the instrument name can contain numbers
    std::string::const_reverse_iterator it = std::find_if(
        hint.rbegin(), hint.rend(), std::not1(std::ptr_fun(isdigit)));
    // No non-digit or all non-digits
    if (it == hint.rend() || it == hint.rbegin()) {
      throw std::invalid_argument(
          "Malformed hint to FileFinderImpl::makeFileName: " + hint);
    }
    std::string::size_type nChars = std::distance(it, hint.rend());
    // Add in special test for PG3
    if (boost::algorithm::istarts_with(hint, "PG3")) {
      instrPart = "PG3";
      nChars = instrPart.length();
    }
    // Another nasty check for SANS2D.  Will do until FileFinder redesign.
    else if (boost::algorithm::istarts_with(hint, "SANS2D")) {
      instrPart = "SANS2D";
      nChars = instrPart.length();
    } else {
      instrPart = hint.substr(0, nChars);
    }
    runPart = hint.substr(nChars);
  }
  unsigned int irunPart(0);
  try {
    irunPart = boost::lexical_cast<unsigned int>(runPart);
  } catch (boost::bad_lexical_cast &) {
    std::ostringstream os;
    os << "Cannot convert '" << runPart << "' to run number.";
    throw std::invalid_argument(os.str());
  }
  Kernel::InstrumentInfo instr =
      Kernel::ConfigService::Instance().getInstrument(instrPart);
  size_t nZero = instr.zeroPadding(irunPart);
  // remove any leading zeros in case there are too many of them
  std::string::size_type i = runPart.find_first_not_of('0');
  runPart.erase(0, i);
  while (runPart.size() < nZero)
    runPart.insert(0, "0");
  if (runPart.size() > nZero && nZero != 0) {
    throw std::invalid_argument(
        "Run number does not match instrument's zero padding");
  }
  instrPart = instr.filePrefix(irunPart);
  return std::make_pair(instrPart, runPart);
}
/**
 * Build a data file name (without extension) from a hint. The hint is either
 * a bare run number or a run number prefixed with an instrument name/short
 * name; when the instrument part is absent the default instrument is used.
 * A delimiter directly after the instrument short name is dropped before the
 * hint is split, and the instrument's delimiter and any allowed suffix are
 * put back into the result.
 * @param hint :: The name hint
 * @param instrument :: The current instrument object
 * @return The file name, or an empty string for an empty hint
 * @throw NotFoundError if a required default is not set
 * @throw std::invalid_argument if the argument is malformed or run number is
 * too long
 */
std::string
FileFinderImpl::makeFileName(const std::string &hint,
                             const Kernel::InstrumentInfo &instrument) const {
  if (hint.empty()) {
    return "";
  }

  std::string name(hint);
  const std::string suffix = extractAllowedSuffix(name);
  const std::string shortName = instrument.shortName();
  const std::string delimiter = instrument.delimiter();

  // Does the hint begin with the short name of the supplied instrument?
  if (name.compare(0, shortName.size(), shortName) == 0) {
    std::string remainder = name.substr(shortName.size());
    const bool delimiterFollows =
        !delimiter.empty() &&
        remainder.compare(0, delimiter.size(), delimiter) == 0;
    if (delimiterFollows) {
      remainder.erase(0, delimiter.size());
    }
    name = shortName + remainder;
  }

  const std::pair<std::string, std::string> parts =
      toInstrumentAndNumber(name);

  // prefix + delimiter + run number + suffix; empty pieces add nothing
  std::string result(parts.first);
  result += delimiter;
  result += parts.second;
  result += suffix;
  return result;
}
/**
 * Determine the extension from a filename.
 *
 * Each supplied extension is compared case-insensitively against the
 * filename, with a trailing "*" wild card stripped first. The first one
 * found wins and the text from its last occurrence to the end of the
 * filename is returned.
 *
 * @param filename The filename to get the extension from.
 * @param exts The list of extensions to try before giving up and
 * using the default: whatever happens after the '.'.
 *
 * @return The extension. If one isn't determined it is an empty string.
 */
std::string
FileFinderImpl::getExtension(const std::string &filename,
                             const std::vector<std::string> &exts) const {
  g_log.debug() << "getExtension(" << filename << ", exts[" << exts.size()
                << "])\n";

  // The upper-cased filename does not change between iterations.
  const std::string upperFilename = toUpper(filename);

  // go through the list of supplied extensions
  for (const auto &ext : exts) {
    std::string extension = toUpper(ext);
    if (!extension.empty() && extension.back() == '*') {
      // there is a wildcard at play
      extension.pop_back();
    }

    const std::size_t found = upperFilename.rfind(extension);
    if (found != std::string::npos) {
      g_log.debug() << "matched extension \"" << extension << "\" based on \""
                    << ext << "\"\n";
      return filename.substr(found); // grab the actual extensions found
    }
  }

  g_log.debug() << "Failed to find extension. Just using last \'.\'\n";
  const std::size_t pos = filename.find_last_of('.');
  if (pos != std::string::npos) {
    return filename.substr(pos);
  }

  // couldn't find an extension
  return "";
}
/**
 * Build the list of archive searchers to use for a facility.
 *
 * The "datasearch.searcharchive" setting decides whether any are created:
 * empty or "off" disables them, "all" enables them for every facility, "on"
 * enables them only for the default facility and any other value is treated
 * as a list that must contain the facility name. All comparisons are made in
 * lower case.
 *
 * @param facility :: The facility whose archive searchers are wanted
 * @return The created searchers, empty when archive searching does not apply
 */
std::vector<IArchiveSearch_sptr>
FileFinderImpl::getArchiveSearch(const Kernel::FacilityInfo &facility) const {
  const auto lowered = [](std::string text) {
    std::transform(text.begin(), text.end(), text.begin(), tolower);
    return text;
  };

  std::vector<IArchiveSearch_sptr> searchers;

  // read the search-archive option from the config service, in lower case
  const std::string option = lowered(
      Kernel::ConfigService::Instance().getString("datasearch.searcharchive"));

  // nothing to do when switched off, unspecified, or when the facility
  // declares no IArchiveSearch
  if (option.empty() || option == "off" || facility.archiveSearch().empty()) {
    return searchers;
  }

  // work out whether the user wants archive search for this facility
  bool wanted = false;
  if (option == "all") {
    wanted = true;
  } else {
    const std::string thisFacility = lowered(facility.name());
    if (option == "on") {
      // only the default facility qualifies
      const std::string defaultFacility = lowered(
          Kernel::ConfigService::Instance().getString("default.facility"));
      wanted = (thisFacility == defaultFacility);
    } else {
      // the option is a list of facilities
      wanted = (option.find(thisFacility) != std::string::npos);
    }
  }

  if (!wanted) {
    return searchers;
  }

  // create one IArchiveSearch per name the facility declares
  for (const auto &searchName : facility.archiveSearch()) {
    g_log.debug() << "get archive search for the facility..." << searchName
                  << "\n";
    searchers.push_back(ArchiveSearchFactory::Instance().create(searchName));
  }
  return searchers;
}

std::string
FileFinderImpl::findRun(const std::string &hintstr,
                        const std::vector<std::string> &exts) const {
  std::string hint = Kernel::Strings::strip(hintstr);
  g_log.debug() << "vector findRun(\'" << hint << "\', exts[" << exts.size()
                << "])\n";

  // if partial filename or run number is not supplied, return here
  if (hint.empty())
    return "";

  // if it looks like a full filename just do a quick search for it
  Poco::Path hintPath(hint);
  if (!hintPath.getExtension().empty()) {
    // check in normal search locations
    g_log.debug() << "hintPath is not empty, check in normal search locations"
                  << "\n";
    std::string path = getFullPath(hint);
    if (!path.empty()) {
      try {
        if (Poco::File(path).exists()) {
          g_log.information() << "found path = " << path << '\n';
          return path;
      } catch (Poco::Exception &) {
    } else {
      g_log.debug() << "Unable to find files via directory search with the "
                       "filename that looks like a full filename"
                    << "\n";
    }
  }
  // get instrument and facility
  const Kernel::InstrumentInfo instrument = this->getInstrument(hint);
  const Kernel::FacilityInfo &facility = instrument.facility();
  // get facility extensions
  const std::vector<std::string> facility_extensions = facility.extensions();
  // select allowed extensions
  std::vector<std::string> extensions;

  g_log.debug() << "Add facility extensions defined in the Facility.xml file"
                << "\n";
  extensions.assign(facility_extensions.begin(), facility_extensions.end());

  // Do we need to try and form a filename from our preset rules
  std::string filename(hint);
  std::string extension = getExtension(hint, extensions);
  if (!extensions.empty())
    filename = hint.substr(0, hint.rfind(extension));
  if (hintPath.depth() == 0) {
    try {
      if (!facility.noFilePrefix()) {
        filename = makeFileName(filename, instrument);
      }
    } catch (std::invalid_argument &) {
      if (filename.length() >= hint.length()) {
        g_log.information() << "Could not form filename from standard rules '"
                            << filename << "'\n";
      }
    }
  }
  // Look first at the original filename then for case variations. This is
  // important
  // on platforms where file names ARE case sensitive.
  // Sorry for the duplication, a last minute fix was required. Ticket #6419 is
  // tasked with a redesign of
  // the whole file finding concept.

  std::set<std::string> filenames;
  filenames.insert(filename);
  if (!getCaseSensitive()) {
    std::string transformed(filename);
    std::transform(filename.begin(), filename.end(), transformed.begin(),
                   toupper);
    filenames.insert(transformed);
    std::transform(filename.begin(), filename.end(), transformed.begin(),
                   tolower);
    filenames.insert(transformed);
  }
  // Merge the extensions & throw out duplicates
  // On Windows throw out ones that only vary in case
  // std::vector<std::string> uniqueExts;
  // uniqueExts.reserve(1 + exts.size() + extensions.size());
  std::set<std::string> uniqueExtsSet;
  if (!extension.empty())
    uniqueExtsSet.insert(extension);
  getUniqueExtensions(exts, uniqueExtsSet);
  getUniqueExtensions(extensions, uniqueExtsSet);

  std::vector<std::string> uniqueExts;
  for (const auto &it : uniqueExtsSet) {
    uniqueExts.push_back(it);
  // determine which archive search facilities to use
  std::vector<IArchiveSearch_sptr> archs = getArchiveSearch(facility);

  std::string path = getPath(archs, filenames, uniqueExts);
  if (!path.empty()) {
    g_log.information() << "found path = " << path << '\n';
    return path;
  } else {
    g_log.information() << "Unable to find run with hint " << hint << "\n";
  }
  g_log.information() << "Unable to find file path for " << hint << "\n";
/**
 * Given a set of already determined extensions and new extensions,
 * create a set of all extensions.
 * If not in an extension-is-case-sensitive environment, only add the
 * lower case OR upper case version of the extension
 * @param exts :: a vector of extensions to add
 * @param uniqueExts :: a set of currently included extensions
 */
void FileFinderImpl::getUniqueExtensions(
    const std::vector<std::string> &exts,
    std::set<std::string> &uniqueExts) const {
  for (const auto &cit : exts) {
    if (!getCaseSensitive()) // prune case variations - this is a hack, see
                             // findRun
    {
      std::string transformed(cit);
      std::transform(cit.begin(), cit.end(), transformed.begin(), tolower);
      auto searchItr = uniqueExts.find(cit);
      if (searchItr != uniqueExts.end())
        continue;
      std::transform(cit.begin(), cit.end(), transformed.begin(), toupper);
      uniqueExts.insert(cit);
    } else {
      uniqueExts.insert(cit);
    }
  }
}

/**
 * Find a list of files given a hint. Calls findRun internally.
 * @param hintstr :: Comma separated list of hints to findRun method.
 *  Can also include ranges of runs, e.g. 123-135 or equivalently 123-35.
 *  Only the beginning of a range can contain an instrument name.
 *  A hint containing "/", "\" or the allowed suffix is treated as a file
 *  name and never as a range.
 * @param exts :: Vector of allowed file extensions. Optional.
 *                If provided, this provides the only extensions searched for.
 *                If not provided, facility extensions used.
 * @return A vector of full paths or empty vector
 * @throw std::invalid_argument if the argument is malformed
 * @throw Exception::NotFoundError if a file could not be found
 */
std::vector<std::string>
FileFinderImpl::findRuns(const std::string &hintstr,
                         const std::vector<std::string> &exts) const {
  std::string hint = Kernel::Strings::strip(hintstr);
  g_log.debug() << "findRuns hint = " << hint << "\n";
  std::vector<std::string> res;
  Mantid::Kernel::StringTokenizer hints(
      hint, ",",
      Mantid::Kernel::StringTokenizer::TOK_TRIM |
          Mantid::Kernel::StringTokenizer::TOK_IGNORE_EMPTY);
  static const boost::regex digits("[0-9]+");

  for (auto h = hints.begin(); h != hints.end(); ++h) {
    const std::string &token = *h;

    // Quick check for a filename: assume that a hint containing either a "/"
    // or "\", or the allowed suffix, is a filename.
    const bool fileSuspected =
        (token.find('\\') != std::string::npos) ||
        (token.find('/') != std::string::npos) ||
        (token.find(ALLOWED_SUFFIX) != std::string::npos);

    Mantid::Kernel::StringTokenizer range(
        token, "-",
        Mantid::Kernel::StringTokenizer::TOK_TRIM |
            Mantid::Kernel::StringTokenizer::TOK_IGNORE_EMPTY);
    if ((range.count() > 2) && (!fileSuspected)) {
      throw std::invalid_argument("Malformed range of runs: " + token);
    } else if ((range.count() == 2) && (!fileSuspected)) {
      std::pair<std::string, std::string> p1 = toInstrumentAndNumber(range[0]);
      std::string run = p1.second;
      const size_t nZero = run.size(); // zero padding
      if (range[1].size() > nZero) {
        throw std::invalid_argument("Malformed range of runs: " + token +
                                    ". The end of string value is longer than "
                                    "the instrument's zero padding");
      }
      const int runNumber = boost::lexical_cast<int>(run);
      std::string runEnd = run;
      // Overwrite the tail of the first run with the (possibly shortened) end
      // of the range, so that e.g. 123-35 becomes 135 with the same padding.
      runEnd.replace(runEnd.end() - range[1].size(), runEnd.end(), range[1]);

      // Throw if runEnd contains something else other than a digit.
      if (!boost::regex_match(runEnd, digits))
        throw std::invalid_argument("Malformed range of runs: Part of the run "
                                    "has a non-digit character in it.");

      const int runEndNumber = boost::lexical_cast<int>(runEnd);
      if (runEndNumber < runNumber) {
        throw std::invalid_argument("Malformed range of runs: " + token);
      }
      for (int irun = runNumber; irun <= runEndNumber; ++irun) {
        run = std::to_string(irun);
        while (run.size() < nZero)
          run.insert(0, "0");
        std::string path = findRun(p1.first + run, exts);
        if (!path.empty()) {
          res.push_back(path);
        } else {
          throw Kernel::Exception::NotFoundError("Unable to find file:", run);
        }
      }
    } else {
      std::string path = findRun(token, exts);
      if (!path.empty()) {
        res.push_back(path);
      } else {
        throw Kernel::Exception::NotFoundError("Unable to find file:", token);
      }
    }
  }

  return res;
}
/**
 * Return the path to the file found in archive.
 *
 * The archives are queried in order and the first non-empty answer wins. An
 * archive that throws is skipped so that the remaining ones are still tried.
 *
 * @param archs :: A list of archives to search
 * @param filenames :: A list of filenames (without extensions) to pass to the
 * archive
 * @param exts :: A list of extensions to check for in turn against each file
 * @return The full path if the file exists and can be found in one of the
 * search locations
 *  or an empty string otherwise.
 */
std::string
FileFinderImpl::getArchivePath(const std::vector<IArchiveSearch_sptr> &archs,
                               const std::set<std::string> &filenames,
                               const std::vector<std::string> &exts) const {
  g_log.debug() << "getArchivePath([IArchiveSearch_sptr], [ ";
  for (const auto &iter : filenames)
    g_log.debug() << iter << " ";
  g_log.debug() << "], [ ";
  for (const auto &iter : exts)
    g_log.debug() << iter << " ";
  g_log.debug() << "])\n";

  std::string path;
  for (const auto &arch : archs) {
    try {
      g_log.debug() << "Getting archive path for requested files\n";
      path = arch->getArchivePath(filenames, exts);
      if (!path.empty()) {
        return path;
      }
    } catch (const std::exception &e) {
      // Best effort: note the failure and move on to the next archive.
      g_log.debug() << "Archive search failed: " << e.what() << '\n';
    } catch (...) {
      // Best effort: an unknown failure must not stop the search either.
    }
  }
  return path;
}
/**
 * Return the full path to the file given its name, checking local directories
 * first.
 * @param archs :: A list of archives to search
 * @param filenames :: A list of filenames (without extensions) to pass to the
 * archive
 * @param exts :: A list of extensions to check for in turn against each file
 * @return The full path if the file exists and can be found in one of the
 * search locations
 *  or an empty string otherwise.
 */
std::string
FileFinderImpl::getPath(const std::vector<IArchiveSearch_sptr> &archs,
                        const std::set<std::string> &filenames,
                        const std::vector<std::string> &exts) const {
  std::string path;

  std::vector<std::string> extensions;
  extensions.assign(exts.begin(), exts.end());

  // Remove wild cards.
  extensions.erase(
      std::remove_if(extensions.begin(), extensions.end(), containsWildCard),
      extensions.end());

  const std::vector<std::string> &searchPaths =
      Kernel::ConfigService::Instance().getDataSearchDirs();

  // Before we try any globbing, make sure we exhaust all reasonable attempts at
  // constructing the possible filename.
  // Avoiding the globbing of getFullPath() for as long as possible will help
  // performance when calling findRuns()
  // with a large range of files, especially when searchPaths consists of
  // folders containing a large number of runs.
  for (auto &extension : extensions) {
    for (const auto &filename : filenames) {
      for (const auto &searchPath : searchPaths) {
          Poco::Path path(searchPath, filename + extension);
          Poco::File file(path);
          if (file.exists())
            return path.toString();

        } catch (Poco::Exception &) { /* File does not exist, just carry on. */
  for (const auto &extension : extensions) {
    for (const auto &filename : filenames) {
      path = getFullPath(filename + extension);
      try {
        if (!path.empty() && Poco::File(path).exists()) {
          g_log.debug() << "path returned from getFullPath() = " << path
                        << '\n';
          return path;
      } catch (std::exception &e) {
        g_log.error() << "Cannot open file " << path << ": " << e.what()
                      << '\n';
        return "";
  // Search the archive
    g_log.debug() << "Search the archives\n";
    std::string path = getArchivePath(archs, filenames, exts);
    try {
      if (!path.empty() && Poco::File(path).exists()) {
        return path;
    } catch (std::exception &e) {
      g_log.error() << "Cannot open file " << path << ": " << e.what() << '\n';
  return "";
}

/**
 * Return an upper-case copy of a string.
 * @param src :: The string to convert
 * @return A copy of src with every character passed through toupper
 */
std::string FileFinderImpl::toUpper(const std::string &src) const {
  std::string result = src;
  // toupper has undefined behaviour for negative char values, so each
  // character is converted through unsigned char.
  std::transform(result.begin(), result.end(), result.begin(),
                 [](const unsigned char c) {
                   return static_cast<char>(std::toupper(c));
                 });
  return result;
}
} // namespace API
} // namespace Mantid