restructuring as a library with athena and redshift

Ari Brown 2023-08-01 13:19:57 -04:00
parent adf909608d
commit c32f8359e7
8 changed files with 18 additions and 8 deletions

minerva/__init__.py (new, empty file)

3 binary files not shown.
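
The restructuring turns the Athena and Redshift helpers into one importable package: the new, empty minerva/__init__.py marks the package, and the updated test scripts below address the modules as minerva.minerva (Athena) and minerva.blueshift (Redshift). A minimal usage sketch under that assumption, with arguments copied from the test scripts:

# Sketch only: module paths inferred from the imports in this commit's test scripts.
import minerva.minerva as a     # Athena wrapper
import minerva.blueshift as b   # Redshift wrapper

athena = a.Athena("hay", "s3://haystac-pmo-athena/")
red = b.Redshift("hay", "s3://haystac-pmo-athena/",
                 db="dev",
                 cluster="redshift-cluster-1")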

@@ -6,6 +6,7 @@ import pyarrow as pa
 import pyarrow.dataset
 import pprint
 import json
+import datetime
 
 pp = pprint.PrettyPrinter(indent=4)
@@ -79,6 +80,8 @@ class Query:
             local = [self.handler.download(f) for f in files]
             self.ds = pa.dataset.dataset(local)
 
+            self.runtime = tiedot['UpdatedAt'] - tiedot['CreatedAt']
+
             return self.ds
         else:
             print("Error:")

@@ -5,6 +5,7 @@ import time
 import pyarrow as pa
 import pyarrow.dataset
 import pprint
+import datetime
 
 pp = pprint.PrettyPrinter(indent=4)
@@ -60,7 +61,8 @@ class Query:
         while status in ['QUEUED', 'RUNNING']:
             time.sleep(5)
-            status = self.status()['State']
+            tiedot = self.info()
+            status = tiedot['Status']['State']
 
         if status == "SUCCEEDED":
             # Because we're using `UNLOAD`, we get a manifest of the files
@@ -72,6 +74,9 @@ class Query:
             local = [self.handler.download(f) for f in files]
             self.ds = pa.dataset.dataset(local)
 
+            ms = tiedot['Statistics']['TotalExecutionTimeInMillis']
+            self.runtime = datetime.timedelta(seconds=ms / 1000)
+
             return self.ds
         else:
             return status  # canceled or error
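
On the Athena side there is no pair of timestamps to subtract; instead the engine reports its execution time in milliseconds under Statistics, and the change converts that into a timedelta. A rough sketch of the underlying boto3 calls (again an assumption about what info() wraps, not the library's exact code), with the query and output location taken from the test script:

import datetime
import boto3

client = boto3.client("athena")

exec_id = client.start_query_execution(
    QueryString="select count(*) as count from trajectories.kitware",
    ResultConfiguration={"OutputLocation": "s3://haystac-pmo-athena/"},
)["QueryExecutionId"]

# get_query_execution returns the dict the diff calls `tiedot`: Status.State drives
# the polling loop, and Statistics carries the engine-reported timing
# (TotalExecutionTimeInMillis is only meaningful once the query has finished).
tiedot = client.get_query_execution(QueryExecutionId=exec_id)["QueryExecution"]

state = tiedot["Status"]["State"]  # QUEUED / RUNNING / SUCCEEDED / FAILED / CANCELLED
ms = tiedot["Statistics"]["TotalExecutionTimeInMillis"]
runtime = datetime.timedelta(seconds=ms / 1000)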

@@ -1,9 +1,9 @@
-import minerva as m
+import minerva.minerva as a
 import pprint
 
 pp = pprint.PrettyPrinter(indent=4)
 
-athena = m.Athena("hay", "s3://haystac-pmo-athena/")
+athena = a.Athena("hay", "s3://haystac-pmo-athena/")
 
 #query = athena.query(
 #"""SELECT *
 #FROM trajectories.kitware
@@ -14,7 +14,8 @@ athena = m.Athena("hay", "s3://haystac-pmo-athena/")
 #""")
 
 query = athena.query("select count(*) as count from trajectories.kitware")
 data = query.results()
-pp.pprint(query.info()['Statistics'])
+pp.pprint(data.head(10))
+print(query.runtime)
 
 # Everything *needs* to have a column in order for parquet to work, so scalar
 # values have to be assigned something, so here we use `as count` to create

@@ -1,4 +1,4 @@
-import blueshift as b
+import minerva.blueshift as b
 import pprint
 
 pp = pprint.PrettyPrinter(indent=4)
@@ -7,7 +7,8 @@ red = b.Redshift("hay", "s3://haystac-pmo-athena/",
                  db="dev",
                  cluster="redshift-cluster-1")
 
 query = red.query("select count(*) from myspectrum_schema.kitware")
-res = query.results()
-pp.pprint(res.head(10))
-pp.pprint(query.info())
+print(query)
+data = query.results()
+pp.pprint(data.head(10))
+print(query.runtime)