added machine pool for easy clustering; working toward Glue support

This commit is contained in:
Ari Brown 2024-05-29 16:23:21 -04:00
parent c2bc9e1028
commit fdc0fd2ded
3 changed files with 71 additions and 20 deletions


@@ -7,6 +7,7 @@ from .s3 import S3
 from .docker import Docker
 from .machine import Machine
 from .pier import Pier
+from .pool import Pool
 from .minerva import Minerva

@@ -22,6 +23,7 @@ __all__ = [
     "cluster_pool",
     "load_template",
     "load_sql",
-    "AWS_INSTANCES"
+    "AWS_INSTANCES",
+    "Pool"
 ]
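With the export in place, the pool can be pulled straight from the package top level (a one-line sketch; assumes the package is imported as minerva, per the repo layout):

from minerva import Pool  # resolves through minerva/__init__.py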


@@ -67,24 +67,6 @@ class Athena:
         return e

-    # FIXME bad sql, can't drop multiple tables in athena
-    def delete_tables(self, db_name, tables):
-        e = Execute(self, f"drop table {', '.join(tables)}")
-        e.run()
-        e.finish()
-        try:
-            self.glue.batch_delete_table(DatabaseName = db_name,
-                                         TablesToDelete = tables)
-        finally:
-            pass
-        for table in tables:
-            s3_uri = os.path.join(self.output, table, "")
-            #print(f"deleting {s3_uri}")
-            self.handler.s3.rm(s3_uri)

     def cancel(self, query_id):
         return self.client.stop_query_execution(QueryExecutionId = query_id)
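The FIXME explains the removal: Athena's DROP TABLE accepts only one table per statement, so the f-string above produced invalid SQL. If per-table deletion were still wanted, a minimal sketch could issue one statement per table, assuming the Execute(athena, sql) / run() / finish() interface seen in the deleted method above:

# Sketch only: assumes the Execute interface from the deleted method.
# Athena accepts one table per DROP TABLE, so loop over the tables.
def delete_tables(athena, tables):
    for table in tables:
        e = Execute(athena, f"drop table if exists `{table}`")
        e.run()
        e.finish()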
@@ -106,7 +88,11 @@ class Athena:
         e.run()
         e.finish()
-        # 2. In chunks
+        # 2. Run the Glue ETL job
+        return self.glue.start_job_run(
+            JobName = 'convert table to delta',
+            Arguments = {'--from': table,
+                         '--to': to})

     def describe_columns(self, table):
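boto3's start_job_run returns a JobRunId, so the caller can track the conversion to completion with get_job_run. A sketch (the wait helper is not part of the commit; the job name is taken from the diff, and glue is assumed to be the same boto3 Glue client as self.glue):

import time

def wait_for_conversion(glue, run, job_name='convert table to delta'):
    # Poll the job run returned by start_job_run() above until it
    # reaches a terminal state, checking every 10 seconds.
    while True:
        state = glue.get_job_run(JobName=job_name,
                                 RunId=run['JobRunId'])['JobRun']['JobRunState']
        if state in ('SUCCEEDED', 'FAILED', 'STOPPED', 'TIMEOUT'):
            return state
        time.sleep(10)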

minerva/pool.py (new file, 63 lines)

@@ -0,0 +1,63 @@
from threading import Thread, Lock


class Pool:
    def __init__(self, worker, num=1):
        # TODO can move the creation into a thread, but that might be too
        # many concurrent requests for AWS
        self.machines = [worker(i).create() for i in range(num)]
        self.mutex = None
        for machine in self.machines:
            machine.join()
            machine.login()

    def run(self, func, data=None):
        if not data or not func:
            return
        self.mutex = Lock()
        # We'll be modifying this, so don't mess with the original
        data = data.copy()
        # All threads share the same `data`; access is controlled by the mutex
        threads = [Thread(target=self.process_queue, args=(machine, func, data))
                   for machine in self.machines]
        # Start the threads
        for thread in threads:
            thread.start()
        # Wait for the workers to finish
        # TODO maybe return STDOUT from everything?
        for thread in threads:
            thread.join()

    def process_queue(self, machine, func, data):
        self.mutex.acquire()
        while data:
            item = data.pop()
            if isinstance(item, list):
                print(f"i'm doing work with [{min(item)}..{max(item)}] on {machine}")
            else:
                print(f"i'm doing work with {item} on {machine}")
            self.mutex.release()
            # do the work
            func(machine, item)
            #time.sleep(0.5)
            # reacquire prior to the return to the while-loop check
            self.mutex.acquire()
        self.mutex.release()  # we're done!

    def terminate(self):
        for mach in self.machines:
            mach.terminate()

    def cost(self):
        return sum(mach.cost() for mach in self.machines)
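For reference, a usage sketch of the new class. MyMachine and task are illustrative stand-ins, not part of the commit; the only assumption is a worker whose instances expose create(), join(), login(), terminate(), and cost(), which is the interface Pool calls:

def task(machine, item):
    # runs on a pool thread; `machine` is one of the pool's workers
    print(f"processing {item} on {machine}")

pool = Pool(MyMachine, num=4)           # provision 4 machines and log in
pool.run(task, data=list(range(100)))   # workers pull items off the shared list
print(f"total spend: {pool.cost()}")
pool.terminate()                        # shut the machines down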