Commit 3dab43bb authored by Jacob's avatar Jacob
Browse files

Added scraper to the codebase

parent 433b0d97
Loading
Loading
Loading
Loading
+14 −0
Original line number Diff line number Diff line
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Probe for the optional scraping dependencies and warn when one is missing."""
import warnings
try:
    # Optional third-party scraping stack; any single failure aborts the probe.
    import bs4
    import requests
    import selenium
    import urllib3
    import cfscrape
    import cloudscraper
    import websocket
except ImportError as e:
    # ImportError.name (Python 3.3+) carries the unresolved module directly;
    # parsing the message string is fragile across Python versions, so it is
    # kept only as a fallback.
    missing_module = e.name or str(e).split("No module named ")[-1].replace("'", "")
    warnings.warn(f'You should install the extra "scrapers", missing required module: {missing_module}.', UserWarning)
+17 −0
Original line number Diff line number Diff line
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Provide connection methods such as Postgres or sqlalchemy engine."""
import logging
from etl import ce


def create_uri(logger=None):
    """Create a URI for a connection to the Postgresql database.

    Connection parameters are read from the environment via ``ce`` with
    development-friendly defaults.

    :param logger: optional logger; when omitted, the ETL logger is resolved
        lazily at call time (the original resolved it in the default argument,
        which runs ``ce()``/``getLogger()`` once at import time).
    :returns: str -- a ``postgres://`` connection URI.
    """
    if logger is None:
        logger = logging.getLogger(ce('ETL_LOGGER', 'main'))
    user = ce('DB_USER', 'guest')
    pwd = ce('DB_PASS', 'abc123')
    host = ce('DB_HOST', 'db')
    db = ce('DB_DB', 'covidb')
    port = ce('DB_PORT', '5432')
    # NOTE(review): SQLAlchemy >= 1.4 rejects the 'postgres://' scheme in
    # favour of 'postgresql://' -- confirm against the engine in use before
    # changing, since callers may depend on the exact string returned.
    uri = f'postgres://{user}:{pwd}@{host}:{port}/{db}'
    # Mask the password so credentials never land in the logs.
    logger.info(f'Created URI: {uri.replace(pwd, "****")}')
    return uri
+49 −0
Original line number Diff line number Diff line
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from pathlib import Path
"""Provide generics for dealing with csvs."""
import os
import pandas as pd
from common.logz import create_logger


def get_csv(csv, **kwargs):
    """Retrieve a CSV with standard defaults using Pandas.

    :param csv: path of the csv file to import
    :param kwargs: extra keyword arguments forwarded to ``pandas.read_csv``
    :returns: pandas.DataFrame
    :raises FileNotFoundError: if *csv* does not exist
    """
    if not os.path.exists(csv):
        # Fail fast with a clear error; the original logged and then fell
        # through to read_csv, which raised a less direct error anyway.
        logger = create_logger()
        logger.error(f'csv file at {csv} does not exist.')
        raise FileNotFoundError(f'csv file at {csv} does not exist.')
    # on_bad_lines='skip' replaces error_bad_lines=False, which was removed
    # in pandas 2.0 (on_bad_lines exists since 1.3).  The deprecated
    # infer_datetime_format flag is dropped: format inference is the default
    # behaviour in modern pandas.
    return pd.read_csv(csv, na_values=[' ', '', 'NA', '<NA>'],
                       keep_default_na=True,
                       parse_dates=['updated', 'access_time'],
                       encoding='utf_8',
                       on_bad_lines='skip', **kwargs)


def glob_csvs(directory, logger=None):
    """Glob for all CSVs in a directory.

    :param directory: directory to search (str or Path)
    :param logger: optional logger; when omitted, ``create_logger()`` is
        called lazily at call time (the original evaluated it in the default
        argument, running it at import time and sharing a single instance
        across all calls).
    :returns: list of CSV path strings; empty when none are found or the
        directory is missing / not a directory
    """
    if logger is None:
        logger = create_logger()

    dir_path = Path(directory)

    # Guard clauses replace the if/elif/else pyramid; they also stop the
    # spurious trailing 'No CSV files found' warning the original emitted
    # right after 'does not exist' / 'is not a directory'.
    if not dir_path.exists():
        logger.warning(f'{directory} does not exist')
        return []
    if not dir_path.is_dir():
        logger.warning(f'{directory} is not a directory')
        return []

    logger.info(f'Looking for CSVs in {directory}.')
    csv_strings = [str(x) for x in dir_path.glob('*.csv')]

    if not csv_strings:
        logger.warning(f'No CSV files found in {directory}.')
        return []

    logger.info(f'Found {len(csv_strings)} CSV files.')
    return csv_strings
+16 −0
Original line number Diff line number Diff line
# Request mix-in classes

This directory contains mix-in classes to support the common types of websites
scraped during ETL.

* HTML supported by `html.py`
* JSON supported by `json.py`
* XML supported by `xml.py`
* (soon selenium, but as a sub-class in parent directory)

The idea is that each of these classes provides a `request()` member function
that can be slotted into a `BaseScraper` sub-class such that it provides a
service to connect to a remote host, grab its data, and package it in a 
consistent way.  Usually the package will be a BeautifulSoup object, but for
JSON sites, we just return the JSON object for parsing.
+3 −0
Original line number Diff line number Diff line
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Allow imports of python modules from this directory."""
Loading