Added support for distributed dataframes from Athena queries

This commit is contained in:
Ari Brown 2023-10-10 21:21:22 -04:00
parent 27a1d75bb3
commit bfb5dda6d9
3 changed files with 30 additions and 8 deletions
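
In practice, the new `distribute_results` hook lets query results land on a dask cluster rather than on the local machine. A minimal usage sketch, assuming a reachable dask scheduler; the scheduler address, `handler`, output path, and SQL are all illustrative:

from dask.distributed import Client

client = Client("scheduler-ip:8786")                      # assumed scheduler address
athena = Athena(handler, "s3://some-bucket/athena-out/")  # illustrative arguments

with athena.query("select user_id, day from events") as q:
    ddf = q.distribute_results(client)   # dask dataframe, partitions on workers
    print(ddf.groupby("user_id").size().compute())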

View file

@@ -6,6 +6,7 @@ import pyarrow as pa
import pyarrow.dataset
import pprint
import datetime
import dask.dataframe as dd
from minerva import parallel_map
pp = pprint.PrettyPrinter(indent=4)
@@ -16,12 +17,14 @@ class Athena:
        self.client = handler.session.client("athena")
        self.output = output

    # For when you want to receive the results of a query
    def query(self, sql, params=[]):
        q = Query(self, sql, params)
        q.run()
        return q

    # For when you want to send a query to run, but there aren't results
    # (like a DDL query for creating databases and tables, etc.)
    def execute(self, sql, params=[]):
@@ -29,6 +32,7 @@ class Athena:
        e.run()
        return e

class Execute:
    """
    Execute will not return results, but will execute the SQL and return the final state.
@@ -44,10 +48,12 @@ class Execute:
        self.temps = []
        self.ds = None

    # The string of the query
    def query(self):
        return self.sql

    # Send the SQL to Athena for running
    def run(self):
        if self.__class__ == Query:
@@ -66,16 +72,19 @@ class Execute:
        self.query_id = resp['QueryExecutionId']
        return resp

    # The status of the SQL (running, queued, succeeded, etc.)
    def status(self):
        return self.info()['Status']['State']

    # Get the basic information on the SQL
    def info(self):
        res = self.client.get_query_execution(QueryExecutionId=self.query_id)
        self.info_cache = res['QueryExecution']
        return self.info_cache

    # Block until the SQL has finished running
    def finish(self):
        stat = self.status()
@@ -104,6 +113,7 @@ class Query(Execute):
            f"with (format = '{self.DATA_STYLE}')"
        return query

    # Gets the files that are listed in the manifest (from the UNLOAD part of
    # the statement)
    # Blocks until the query has finished (because it calls `self.finish()`)
@@ -128,6 +138,7 @@ class Query(Execute):
            raise Exception(self.info_cache['Status']['AthenaError']['ErrorMessage'])
        #return status # canceled or error

    # Blocks until the query has finished running and then returns you a pyarrow
    # dataset of the results.
    # Calls `self.manifest_files()` which blocks via `self.finish()`
@@ -140,17 +151,34 @@ class Query(Execute):
        self.ds = pa.dataset.dataset(self.temps)
        return self.ds

    # Blocks until the query has finished (via `self.manifest_files()`), reads
    # each result file into a pandas dataframe, scatters it to a worker, and
    # assembles the pieces into a single distributed dask dataframe.
    def distribute_results(self, client):
        if self.ds:
            return self.ds  # results were already materialized locally
        futures = []
        meta = None
        for fn in self.manifest_files():
            df = pd.read_csv(fn)
            meta = df.head(0)  # empty frame that carries the column schema
            futures.append(client.scatter(df))
        return dd.from_delayed(futures, meta=meta)

    # Return scalar results
    # Abstracts away a bunch of keystrokes
    def scalar(self):
        return self.results().head(1)[0][0].as_py()

    def __enter__(self):
        return self

    def __exit__(self, exception_type, exception_value, exception_traceback):
        self.close()

    def close(self):
        if self.temps:
            for file in self.temps:

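The scatter-then-assemble pattern behind `distribute_results` can be exercised on its own. A standalone sketch of the same idea, with made-up file names and a throwaway local cluster:

import pandas as pd
import dask.dataframe as dd
from dask.distributed import Client

client = Client()                        # local cluster, for demonstration
files = ["part-0.csv", "part-1.csv"]     # hypothetical result files

futures = []
meta = None
for fn in files:
    df = pd.read_csv(fn)
    meta = df.head(0)                    # empty frame that carries the schema
    futures.append(client.scatter(df))   # ship this partition to a worker

ddf = dd.from_delayed(futures, meta=meta)
print(len(ddf))                          # triggers a distributed row count
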
View file

@@ -40,9 +40,6 @@ class Cluster:
        self.create()
        self.login()
        self.start_dask()
        #self.connect()
        #return self.client

    # Begin the startup process in the background
@@ -69,11 +66,6 @@ class Cluster:
            w.cmd(f"dask worker {self.scheduler.private_ip}:8786", disown=True)

    def connect(self):
        self.client = Client(self.location)
        return self.client

    def terminate(self):
        self.scheduler.terminate()
        for w in self.workers:

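For reference, the pieces `Cluster` wires together map onto the stock dask CLI; the same topology can be stood up by hand (the scheduler address below is a placeholder):

# on the scheduler machine:  dask scheduler --port 8786
# on each worker machine:    dask worker <scheduler-ip>:8786

from dask.distributed import Client

client = Client("<scheduler-ip>:8786")     # equivalent of Cluster.connect()
print(client.scheduler_info()["workers"])  # confirm the workers registered
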
View file

@@ -75,6 +75,8 @@ class Machine:
        self.thread.join()

    def wait(self, n):
        time.sleep(n)  # give time for AWS to register that the instance has been created
        i = 0
        # Time for the server to show as "running"
        # and time for the server to finish getting daemons running
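
The truncated polling loop above evidently waits for the instance to come up; boto3's built-in waiters express the same wait more directly. A sketch, with a hypothetical instance id, and noting that "status ok" is only a rough proxy for user daemons being ready:

import boto3

ec2 = boto3.client("ec2")
iid = "i-0123456789abcdef0"  # hypothetical instance id

ec2.get_waiter("instance_running").wait(InstanceIds=[iid])    # state == running
ec2.get_waiter("instance_status_ok").wait(InstanceIds=[iid])  # boot-time checks passed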