adding dask examples

This commit is contained in:
Ari Brown 2023-11-16 10:33:56 -05:00
parent c0ff6af866
commit fe06b6b808
6 changed files with 149 additions and 18 deletions

View file

@ -19,16 +19,16 @@ class Athena:
# For when you want to run a query and receive its results
def query(self, sql, params=[]):
q = Query(self, sql, params)
def query(self, sql, params=[], format='parquet'):
q = Query(self, sql, params, format)
q.run()
return q
# For when you want to send a query to run, but there aren't results
# (like a DDL statement for creating databases, tables, etc.)
def execute(self, sql, params=[]):
e = Execute(self, sql, params)
def execute(self, sql, params=[], format=None):
e = Execute(self, sql, params, format)
e.run()
return e
@ -38,7 +38,7 @@ class Execute:
Execute will not return results, but will execute the SQL and return the final state.
Execute is meant to be used for DDL statements such as CREATE DATABASE/TABLE
"""
def __init__(self, athena, sql, params=[]):
def __init__(self, athena, sql, params=[], format='parquet'):
self.athena = athena
self.handler = athena.handler
self.client = athena.client
@ -47,6 +47,8 @@ class Execute:
self.info_cache = None
self.temps = []
self.ds = None
self.files = None
self.format = format
# The string of the query
@ -56,10 +58,7 @@ class Execute:
# Send the SQL to Athena for running
def run(self):
if self.__class__ == Query:
config = {"OutputLocation": os.path.join(self.athena.output, "results")}
else:
config = {"OutputLocation": self.athena.output}
config = {"OutputLocation": os.path.join(self.athena.output, "results")}
if self.params:
resp = self.client.start_query_execution(QueryString=self.query(),
@ -102,7 +101,6 @@ class Execute:
class Query(Execute):
DATA_STYLE = 'parquet'
# Automatically wraps the SQL in an UNLOAD that writes the results in the requested format (Parquet by default)
def query(self):
@ -110,7 +108,7 @@ class Query(Execute):
"results",
str(random.random()))
query = f"unload ({self.sql}) to {repr(out)} " + \
f"with (format = '{self.DATA_STYLE}')"
f"with (format = '{self.format}')"
return query
@ -118,19 +116,19 @@ class Query(Execute):
# the statement)
# Blocks until the query has finished (because it calls `self.finish()`)
def manifest_files(self):
if self.files:
return self.files
status = self.finish()
if status == "SUCCEEDED":
# Track the runtime
ms = self.info_cache['Statistics']['TotalExecutionTimeInMillis']
self.runtime = datetime.timedelta(seconds=ms / 1000)
# Because we're using `UNLOAD`, we get a manifest of the files
# that make up our data.
manif = self.info_cache['Statistics']['DataManifestLocation']
files = self.handler.s3.read(manif).split("\n")
files = [f.strip() for f in files if f.strip()] # filter empty
self.files = files
return files
else:
print("Error")
@ -152,7 +150,7 @@ class Query(Execute):
return self.ds
def distribute_results(self, client):
def distribute_results(self, client, size=10000):
import dask.dataframe as dd
import pandas as pd
@ -163,7 +161,8 @@ class Query(Execute):
print(f"{len(self.manifest_files())} files in manifest")
for fn in self.manifest_files():
print(f"reading {fn}...")
df = pd.read_parquet(fn)
df = dd.from_pandas(pd.read_parquet(fn), chunksize=100000)
print(df._meta)
print("\tloaded")
future = client.scatter(df)
print("\tscattered")

View file

@ -93,7 +93,7 @@ class Machine:
# Final wait, now that the server is up and running -- need
# some time for daemons to start
time.sleep(25)
time.sleep(35)
self.ready = True