Commit 3dab43bb authored by Jacob's avatar Jacob
Browse files

Added scraper to the codebase

parent 433b0d97
Loading
Loading
Loading
Loading
+14 −0
Original line number Diff line number Diff line
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Probe for the optional scraping dependencies and warn when one is missing."""
import warnings
try:
    # Optional third-party scraping stack; any single failure aborts the probe.
    import bs4
    import requests
    import selenium
    import urllib3
    import cfscrape
    import cloudscraper
    import websocket
except ImportError as e:
    # ImportError.name (Python 3.3+) carries the unresolved module directly;
    # parsing the message string is fragile across Python versions, so it is
    # kept only as a fallback.
    missing_module = e.name or str(e).split("No module named ")[-1].replace("'", "")
    warnings.warn(f'You should install the extra "scrapers", missing required module: {missing_module}.', UserWarning)
+17 −0
Original line number Diff line number Diff line
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Provide connection methods such as Postgres or sqlalchemy engine."""
import logging
from etl import ce


def create_uri(logger=None):
    """Create a URI for a connection to the Postgresql database.

    Connection parameters are read from the environment via ``ce`` with
    development-friendly defaults.

    :param logger: optional logger; when omitted, the ETL logger is resolved
        lazily at call time (the original resolved it in the default argument,
        which runs ``ce()``/``getLogger()`` once at import time).
    :returns: str -- a ``postgres://`` connection URI.
    """
    if logger is None:
        logger = logging.getLogger(ce('ETL_LOGGER', 'main'))
    user = ce('DB_USER', 'guest')
    pwd = ce('DB_PASS', 'abc123')
    host = ce('DB_HOST', 'db')
    db = ce('DB_DB', 'covidb')
    port = ce('DB_PORT', '5432')
    # NOTE(review): SQLAlchemy >= 1.4 rejects the 'postgres://' scheme in
    # favour of 'postgresql://' -- confirm against the engine in use before
    # changing, since callers may depend on the exact string returned.
    uri = f'postgres://{user}:{pwd}@{host}:{port}/{db}'
    # Mask the password so credentials never land in the logs.
    logger.info(f'Created URI: {uri.replace(pwd, "****")}')
    return uri
+49 −0
Original line number Diff line number Diff line
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from pathlib import Path
"""Provide generics for dealing with csvs."""
import os
import pandas as pd
from common.logz import create_logger


def get_csv(csv, **kwargs):
    """Retrieve a CSV with standard defaults using Pandas.

    :param csv: path of the csv file to import
    :param kwargs: extra keyword arguments forwarded to ``pandas.read_csv``
    :returns: pandas.DataFrame
    :raises FileNotFoundError: if *csv* does not exist
    """
    if not os.path.exists(csv):
        # Fail fast with a clear error; the original logged and then fell
        # through to read_csv, which raised a less direct error anyway.
        logger = create_logger()
        logger.error(f'csv file at {csv} does not exist.')
        raise FileNotFoundError(f'csv file at {csv} does not exist.')
    # on_bad_lines='skip' replaces error_bad_lines=False, which was removed
    # in pandas 2.0 (on_bad_lines exists since 1.3).  The deprecated
    # infer_datetime_format flag is dropped: format inference is the default
    # behaviour in modern pandas.
    return pd.read_csv(csv, na_values=[' ', '', 'NA', '<NA>'],
                       keep_default_na=True,
                       parse_dates=['updated', 'access_time'],
                       encoding='utf_8',
                       on_bad_lines='skip', **kwargs)


def glob_csvs(directory, logger=None):
    """Glob for all CSVs in a directory.

    :param directory: directory to search (str or Path)
    :param logger: optional logger; when omitted, ``create_logger()`` is
        called lazily at call time (the original evaluated it in the default
        argument, running it at import time and sharing a single instance
        across all calls).
    :returns: list of CSV path strings; empty when none are found or the
        directory is missing / not a directory
    """
    if logger is None:
        logger = create_logger()

    dir_path = Path(directory)

    # Guard clauses replace the if/elif/else pyramid; they also stop the
    # spurious trailing 'No CSV files found' warning the original emitted
    # right after 'does not exist' / 'is not a directory'.
    if not dir_path.exists():
        logger.warning(f'{directory} does not exist')
        return []
    if not dir_path.is_dir():
        logger.warning(f'{directory} is not a directory')
        return []

    logger.info(f'Looking for CSVs in {directory}.')
    csv_strings = [str(x) for x in dir_path.glob('*.csv')]

    if not csv_strings:
        logger.warning(f'No CSV files found in {directory}.')
        return []

    logger.info(f'Found {len(csv_strings)} CSV files.')
    return csv_strings
+16 −0
Original line number Diff line number Diff line
# Request mix-in classes

This directory contains mix-in classes to support the common types of websites
scraped during ETL.

* HTML supported by `html.py`
* JSON supported by `json.py`
* XML supported by `xml.py`
* (soon selenium, but as a sub-class in parent directory)

The idea is that each of these classes provides a `request()` member function
that can be slotted into a `BaseScraper` sub-class such that it provides a
service to connect to a remote host, grab its data, and package it in a 
consistent way.  Usually the package will be a BeautifulSoup object, but for
JSON sites, we just return the JSON object for parsing.
+3 −0
Original line number Diff line number Diff line
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Allow imports of python modules from this directory."""
Loading