Add a machine pool for easy clustering; begin adding Glue support.

This commit is contained in:
Ari Brown 2024-05-29 16:23:21 -04:00
parent c2bc9e1028
commit fdc0fd2ded
3 changed files with 71 additions and 20 deletions

View file

@ -7,6 +7,7 @@ from .s3 import S3
from .docker import Docker
from .machine import Machine
from .pier import Pier
from .pool import Pool
from .minerva import Minerva
@ -22,6 +23,7 @@ __all__ = [
"cluster_pool",
"load_template",
"load_sql",
"AWS_INSTANCES"
"AWS_INSTANCES",
"Pool"
]

View file

@ -67,24 +67,6 @@ class Athena:
return e
def delete_tables(self, db_name, tables):
    """Delete a set of Athena tables and their backing data.

    For each table this (1) drops the table via Athena SQL, (2) removes
    the catalog entries from Glue in one batch call, and (3) deletes the
    table's output prefix from S3.

    db_name -- Glue database the tables live in.
    tables  -- list of table names to delete.
    """
    if not tables:
        return
    # Athena SQL cannot drop multiple tables in one statement
    # (the old f"drop table {', '.join(tables)}" was invalid),
    # so issue one DROP per table.
    for table in tables:
        e = Execute(self, f"drop table {table}")
        e.run()
        e.finish()
    # Deregister all tables from the Glue catalog in a single batch call.
    # (The old `try: ... finally: pass` was a no-op and has been removed;
    # errors still propagate to the caller.)
    self.glue.batch_delete_table(DatabaseName = db_name,
                                 TablesToDelete = tables)
    # Remove the underlying S3 data for each table.
    for table in tables:
        s3_uri = os.path.join(self.output, table, "")
        self.handler.s3.rm(s3_uri)
def cancel(self, query_id):
    """Stop a running Athena query.

    query_id -- the QueryExecutionId of the query to cancel.
    Returns the raw stop_query_execution response from the client.
    """
    response = self.client.stop_query_execution(QueryExecutionId = query_id)
    return response
@ -106,7 +88,11 @@ class Athena:
e.run()
e.finish()
# 2. In chunks
# 2. Run the Glue ETL job
return self.glue.start_job_run(
JobName = 'convert table to delta',
Arguments = {'--from': table,
'--to': to})
def describe_columns(self, table):

63
minerva/pool.py Normal file
View file

@ -0,0 +1,63 @@
from threading import Thread, Lock
class Pool:
    """A pool of cloud machines that drain a shared work queue in parallel.

    One machine is created per worker slot; run() fans a list of work items
    out to every machine, each pulling the next item from a shared,
    mutex-protected queue.
    """

    def __init__(self, worker, num=1):
        """Create and wait for `num` machines built by `worker(i).create()`.

        worker -- callable taking an index and returning an object whose
                  create() yields a machine with join()/login()/terminate()/cost().
        num    -- number of machines in the pool.
        """
        # TODO can move the creation into a thread, but that might be too
        # many concurrent requests for AWS
        self.machines = [worker(i).create() for i in range(num)]
        # Created lazily in run(); guards the shared work queue.
        self.mutex = None
        # Block until every machine is up and reachable.
        for machine in self.machines:
            machine.join()
            machine.login()

    def run(self, func, data=None):
        """Run func(machine, item) for every item in `data`, spread across machines.

        Blocks until all items are processed. No-op when func or data is empty.
        (Fixes the original signature, which was missing `self` and used a
        mutable default argument.)
        """
        if not data or not func:
            return
        self.mutex = Lock()
        # We'll be modifying this, don't mess with the original
        queue = list(data)
        # All threads are sharing the same queue and access is controlled by a mutex
        threads = [Thread(target=self.process_queue, args=(machine, func, queue))
                   for machine in self.machines]
        # Start the threads
        for thread in threads:
            thread.start()
        # Wait for the workers to finish
        # TODO maybe return STDOUT from everything?
        for thread in threads:
            thread.join()

    def process_queue(self, machine, func, data):
        """Worker loop: pop items off the shared queue under the mutex until empty."""
        self.mutex.acquire()
        while data:
            item = data.pop()
            if isinstance(item, list):
                print(f"i'm doing work with [{min(item)}..{max(item)}] on {machine}")
            else:
                print(f"i'm doing work with {item} on {machine}")
            # Release while doing the (potentially slow) work so the other
            # workers can keep pulling items.
            self.mutex.release()
            # do the work
            func(machine, item)
            # prior to return to the while-loop check
            self.mutex.acquire()
        self.mutex.release()  # we're done!

    def terminate(self):
        """Terminate every machine in the pool."""
        for mach in self.machines:
            mach.terminate()

    def cost(self):
        """Total cost across all machines.

        (Fixes the original, which referenced the undefined name `machines`
        and raised NameError.)
        """
        return sum(mach.cost() for mach in self.machines)