Loading ess-dive-meta.py 100644 → 100755 +53 −15 Original line number Diff line number Diff line Loading @@ -14,6 +14,9 @@ from hashlib import md5 from io import StringIO from datetime import datetime as dt verbose = False debug = False trace = False demo_configuration = { 'columns': { Loading Loading @@ -155,16 +158,35 @@ def text_find_header(input_file: str): with open(input_file, "r") as f: # Get the lines. try: lns = f.readlines() except: print('failed') if verbose: try: lns = [] for l in f: print('l:',l) lns.append(l) print('----------------------') print(lns) print('----------------------') except: sys.exit(1) if verbose: n = len([l for l in lns if l.startswith('#')]) print('lines ignored:', n) # Get the number of lines. ln_count = len(lns) # Get one quarter tail of the file; make pseudo file. tail_io = StringIO("\n".join(lns[-1*int(ln_count/4):])) if trace: print(tail_io.readlines()) # Read with pandas and count the number of columns. tail_cols = len(list(pd.read_csv(tail_io))) #, skiprows=[0]))) # Loop until we find the header row. for i, ln in enumerate(lns): Loading Loading @@ -196,7 +218,6 @@ def text_find_header(input_file: str): return None def text_parser(file: str, metadata: bool=False, dropna: bool=True): """ A table parser for any input file that can be read as text. Loading @@ -212,7 +233,6 @@ def text_parser(file: str, metadata: bool=False, dropna: bool=True): an integer indicating the header row number (if hdrline is True). """ # Find the header row. header_row = text_find_header(file) Loading Loading @@ -412,16 +432,23 @@ def buildConfiguration(fields): if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('-d', '--directory', dest='dir', type=str, help='directory containing csv files', required=True) parser.add_argument('-v', '--verbose', action='store_true', help='turn on additional printing') parser.add_argument('-b', '--debug', action='store_true', help='turn on additional printing for debugging') parser.add_argument('-t', '--trace', action='store_true', help='turn on much more printing for debugging') parser.add_argument('-g', '--config', dest='config', type=argparse.FileType('r'), default=demo_configuration, help='csv file with a single row containing items to retrieve from the files - Ex: site,plot,latitude,soil_depth') group = parser.add_mutually_exclusive_group(required=True) group.add_argument('-d', '--directory', dest='dir', type=str, help='directory containing csv files')#, required=True) group.add_argument('-f', '--file', dest='file', type=argparse.FileType('r'), help='csv file')#, required=True) args = parser.parse_args() if args.config != demo_configuration: fields = pd.read_csv(args.config) cfg = buildConfiguration(fields) else: cfg = args.config args.config = buildConfiguration(fields) verbose = args.verbose debug = args.debug trace = args.trace # if args.config != demo_configuration: # items = list(pd.read_csv(args.config).keys()) Loading @@ -429,20 +456,31 @@ if __name__ == "__main__": # items = None # config = args.config if os.path.isdir(args.dir): if args.dir: if not os.path.isdir(args.dir): print('Must be a valid directory') parser.print_usage() sys.exit(1) for (root, dirs, files) in os.walk(args.dir): paths = [ os.path.join(root, file) for file in files if file.endswith('csv') ] for path in paths: # Call main. output = main(path, config=cfg) output = main(path, config=args.config) # Dump the dict as a json to stdout. print(json.dumps(output, indent=2, cls=NumpyEncoder)) else: print('Must be a valid directory') parser.print_usage() elif args.file: output = main(args.file.name, config=args.config) print(json.dumps(output, indent=2, cls=NumpyEncoder)) else: print('something went wrong') sys.exit(1) # Exit demo. sys.exit() Loading Loading
ess-dive-meta.py 100644 → 100755 +53 −15 Original line number Diff line number Diff line Loading @@ -14,6 +14,9 @@ from hashlib import md5 from io import StringIO from datetime import datetime as dt verbose = False debug = False trace = False demo_configuration = { 'columns': { Loading Loading @@ -155,16 +158,35 @@ def text_find_header(input_file: str): with open(input_file, "r") as f: # Get the lines. try: lns = f.readlines() except: print('failed') if verbose: try: lns = [] for l in f: print('l:',l) lns.append(l) print('----------------------') print(lns) print('----------------------') except: sys.exit(1) if verbose: n = len([l for l in lns if l.startswith('#')]) print('lines ignored:', n) # Get the number of lines. ln_count = len(lns) # Get one quarter tail of the file; make pseudo file. tail_io = StringIO("\n".join(lns[-1*int(ln_count/4):])) if trace: print(tail_io.readlines()) # Read with pandas and count the number of columns. tail_cols = len(list(pd.read_csv(tail_io))) #, skiprows=[0]))) # Loop until we find the header row. for i, ln in enumerate(lns): Loading Loading @@ -196,7 +218,6 @@ def text_find_header(input_file: str): return None def text_parser(file: str, metadata: bool=False, dropna: bool=True): """ A table parser for any input file that can be read as text. Loading @@ -212,7 +233,6 @@ def text_parser(file: str, metadata: bool=False, dropna: bool=True): an integer indicating the header row number (if hdrline is True). """ # Find the header row. header_row = text_find_header(file) Loading Loading @@ -412,16 +432,23 @@ def buildConfiguration(fields): if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('-d', '--directory', dest='dir', type=str, help='directory containing csv files', required=True) parser.add_argument('-v', '--verbose', action='store_true', help='turn on additional printing') parser.add_argument('-b', '--debug', action='store_true', help='turn on additional printing for debugging') parser.add_argument('-t', '--trace', action='store_true', help='turn on much more printing for debugging') parser.add_argument('-g', '--config', dest='config', type=argparse.FileType('r'), default=demo_configuration, help='csv file with a single row containing items to retrieve from the files - Ex: site,plot,latitude,soil_depth') group = parser.add_mutually_exclusive_group(required=True) group.add_argument('-d', '--directory', dest='dir', type=str, help='directory containing csv files')#, required=True) group.add_argument('-f', '--file', dest='file', type=argparse.FileType('r'), help='csv file')#, required=True) args = parser.parse_args() if args.config != demo_configuration: fields = pd.read_csv(args.config) cfg = buildConfiguration(fields) else: cfg = args.config args.config = buildConfiguration(fields) verbose = args.verbose debug = args.debug trace = args.trace # if args.config != demo_configuration: # items = list(pd.read_csv(args.config).keys()) Loading @@ -429,20 +456,31 @@ if __name__ == "__main__": # items = None # config = args.config if os.path.isdir(args.dir): if args.dir: if not os.path.isdir(args.dir): print('Must be a valid directory') parser.print_usage() sys.exit(1) for (root, dirs, files) in os.walk(args.dir): paths = [ os.path.join(root, file) for file in files if file.endswith('csv') ] for path in paths: # Call main. output = main(path, config=cfg) output = main(path, config=args.config) # Dump the dict as a json to stdout. print(json.dumps(output, indent=2, cls=NumpyEncoder)) else: print('Must be a valid directory') parser.print_usage() elif args.file: output = main(args.file.name, config=args.config) print(json.dumps(output, indent=2, cls=NumpyEncoder)) else: print('something went wrong') sys.exit(1) # Exit demo. sys.exit() Loading