Commit 303bc087 authored by Price, Zach's avatar Price, Zach
Browse files

Add dataframe functionality

parent 9c8a7388
Loading
Loading
Loading
Loading
Loading
+5 −0
Original line number Diff line number Diff line
@@ -20,3 +20,8 @@ async def startup() -> None:
async def shutdown() -> None:
    if database.is_connected:
        await database.disconnect()


@app.get('/healthz')
async def health_and_liveness_probe():
    return {'status': 'healthy'}
+4 −0
Original line number Diff line number Diff line
from datetime import datetime

import databases
import sqlalchemy
from ormar import Integer, Model, String
@@ -48,6 +50,8 @@ class Files(ARMPath):
    datastream: str = String(max_length=30)
    md5_checksum: str = String(max_length=32, min_length=32, regex=r'[a-fA-F\d]+')
    type: str = "file"
    start_time: datetime
    end_time: datetime

    @property_field
    def name(self) -> str:
+19 −11
Original line number Diff line number Diff line
from datetime import datetime
from typing import List

from fastapi.routing import APIRouter
@@ -9,29 +10,24 @@ from .models import Datastreams, Files, Sites
router = APIRouter(route_class=versioned_api_route(1))


@router.get('/healthz')
async def health_and_liveness_probe():
    return {'status': 'healthy'}


@router.get("/fs/", response_model=List[Sites])
async def list_sites():
    # if config.debug:
    #     print(Sites.objects.build_select_expression())
    if config.debug:
        print(Sites.objects.build_select_expression())
    return await Sites.objects.all()


@router.get("/fs/{site}", response_model=List[Datastreams])
async def list_datastreams_for_site(site: Sites.site_code):
    # if config.debug:
    #     print(Datastreams.objects.build_select_expression())
    if config.debug:
        print(Datastreams.objects.build_select_expression())
    return await Datastreams.objects.filter(site_code=site).all()


@router.get('/fs/{site}/{datastream}', response_model=List[Files])
async def list_files_for_datastream(site: Sites.site_code, datastream: Datastreams.datastream):
    # if config.debug:
    #     print(Files.objects.build_select_expression())
    if config.debug:
        print(Files.objects.build_select_expression())
    return await Files.objects.filter(datastream=datastream).all()


@@ -41,3 +37,15 @@ async def stat_file(site: Sites.site_code, datastream: Datastreams.datastream, f
        print(Files.objects.filter(versioned_filename=filename).build_select_expression())

    return await Files.objects.filter(versioned_filename=filename).get()


@router.get('/search', response_model=List[Files])
async def search(datastream: Files.datastream, start: datetime, end: datetime):
    results = Files.objects.filter(start_time__gte=start, end_time__lte=end)
    if datastream:
        results = results.filter(datastream=datastream)

    if config.debug:
        print(results.build_select_expression())

    return await results.get()

ADL/client/archive.py

0 → 100644
+53 −0
Original line number Diff line number Diff line
import json
import re
from datetime import datetime

import pandas as pd
import xarray as xr

ATTR_REGEX = r'^[a-z]{3}\w*[A-Z][0-9]*\.[a-z\d]\d:att:(\S+)\s+val\s+(.+)$'
SITE_REGEX = r'^[a-z]{3}$'
DATASTREAM_REGEX = r'^([a-z]{3})\w*([A-Z][0-9]*)\.([a-z\d]\d)$'


class Archive(object):
    def __init__(self, username, password):
        self._selected_sites = None
        self._selected_datastreams = None
        self._selected_time_start = datetime.mix
        self._selected_time_stop = datetime.max
        self.attributes = {}
        self.token = None

    def __getitem__(self, key):
        if isinstance(key, str):
            if re.match(SITE_REGEX, key):
                self._selected_sites = set({key})
            if re.match(DATASTREAM_REGEX, key):
                self._selected_datastreams = set({key})
        elif isinstance(key, slice):
            if key.start:
                self._selected_time_start = pd.to_datetime(key.start)
            if key.stop:
                self._selected_time_stop = pd.to_datetime(key.stop)
        return self

    @property
    def file_count(self):
        if not self._selected_datastreams:
            return 29796186
        if self._selected_time_start:
            return (self._selected_time_stop - self._selected_time_start).days

    @property
    def file_list(self):
        return (
            f'https://meta-archive.arm.gov/sgp/sgpaerioe1turnC1.c1/sgpaerioe1turnC1.c1.{file_date.strftime("%Y%m%d")}.0000.nc.v0' for file_date in pd.date_range(self._selected_time_start, self._selected_time_stop)
        )

    def to_xarray(self):
        return xr.open_mfdataset(self.file_list)

    def __repr__(self):
        attr_list = ', '.join(f'{attr}: {getattr(self, attr)}' for attr in ('_selected_time_start', '_selected_time_stop', '_selected_sites', '_selected_datastreams'))
        return f'Archive({attr_list})'
+1 −0
Original line number Diff line number Diff line
@@ -6,6 +6,7 @@ from . import config


class ARMFs(HTTPFileSystem):
    protocol = 'arm'

    def __init__(self, *args, **storage_options):

Loading