adding dask examples

2023-11-16 10:33:56 -05:00 · 2023-11-16 10:33:56 -05:00 · fe06b6b808
commit fe06b6b808
parent c0ff6af866
6 changed files with 149 additions and 18 deletions
--- a/minerva/athena.py
+++ b/minerva/athena.py
@@ -19,16 +19,16 @@ class Athena:


    # For when you want to receive the results of something
-    def query(self, sql, params=[]):
-        q = Query(self, sql, params)
+    def query(self, sql, params=[], format='parquet'):
+        q = Query(self, sql, params, format)
        q.run()
        return q


    # For when you want to send a query to run, but there aren't results
    # (like a DML query for creating databases and tables etc)
-    def execute(self, sql, params=[]):
-        e = Execute(self, sql, params)
+    def execute(self, sql, params=[], format=None):
+        e = Execute(self, sql, params, format)
        e.run()
        return e

@@ -38,7 +38,7 @@ class Execute:
    Execute will not return results, but will execute the SQL and return the final state.
    Execute is meant to be used for DML statements such as CREATE DATABASE/TABLE
    """
-    def __init__(self, athena, sql, params=[]):
+    def __init__(self, athena, sql, params=[], format='parquet'):
        self.athena  = athena
        self.handler = athena.handler
        self.client  = athena.client
@@ -47,6 +47,8 @@ class Execute:
        self.info_cache = None
        self.temps   = []
        self.ds      = None
+        self.files   = None
+        self.format  = format


    # The string of the query
@@ -56,10 +58,7 @@ class Execute:

    # Send the SQL to Athena for running
    def run(self):
-        if self.__class__ == Query:
-            config = {"OutputLocation": os.path.join(self.athena.output, "results")}
-        else:
-            config = {"OutputLocation": self.athena.output}
+        config = {"OutputLocation": os.path.join(self.athena.output, "results")}

        if self.params:
            resp  = self.client.start_query_execution(QueryString=self.query(),
@@ -102,7 +101,6 @@ class Execute:


 class Query(Execute):
-    DATA_STYLE = 'parquet'

    # Automatically includes unloading the results to Parquet format
    def query(self):
@@ -110,7 +108,7 @@ class Query(Execute):
                              "results",
                              str(random.random()))
        query  = f"unload ({self.sql}) to {repr(out)} " + \
-                 f"with (format = '{self.DATA_STYLE}')"
+                 f"with (format = '{self.format}')"
        return query


@@ -118,19 +116,19 @@ class Query(Execute):
    # the statement)
    # Blocks until the query has finished (because it calls `self.finish()`)
    def manifest_files(self):
+        if self.files:
+            return self.files
+
        status = self.finish()

        if status == "SUCCEEDED":
-            # Track the runtime
-            ms = self.info_cache['Statistics']['TotalExecutionTimeInMillis']
-            self.runtime = datetime.timedelta(seconds=ms / 1000)
-
            # Because we're using `UNLOAD`, we get a manifest of the files
            # that make up our data.
            manif = self.info_cache['Statistics']['DataManifestLocation']
            files = self.handler.s3.read(manif).split("\n")
            files = [f.strip() for f in files if f.strip()] # filter empty

+            self.files = files
            return files
        else:
            print("Error")
@@ -152,7 +150,7 @@ class Query(Execute):
        return self.ds


-    def distribute_results(self, client):
+    def distribute_results(self, client, size=10000):
        import dask.dataframe as dd
        import pandas as pd

@@ -163,7 +161,8 @@ class Query(Execute):
        print(f"{len(self.manifest_files())} files in manifest")
        for fn in self.manifest_files():
            print(f"reading {fn}...")
-            df = pd.read_parquet(fn)
+            df = dd.from_pandas(pd.read_parquet(fn), chunksize=100000)
+            print(df._meta)
            print("\tloaded")
            future = client.scatter(df)
            print("\tscattered")
--- a/minerva/machine.py
+++ b/minerva/machine.py
@@ -93,7 +93,7 @@ class Machine:

        # Final wait, now that the server is up and running -- need
        # some time for daemons to start
-        time.sleep(25)
+        time.sleep(35)
        self.ready = True