significant improvement to the readme and verification that all the examples work

Ari Brown 2024-01-31 16:18:32 -05:00
parent e3c11fb1aa
commit 5dccce53e9
9 changed files with 275 additions and 109 deletions


@@ -43,14 +43,15 @@ class Execute:
     Execute is meant to be used for DDL statements such as CREATE DATABASE/TABLE
     """
     def __init__(self, redshift, sql):
-        self.redshift = redshift
-        self.handler = redshift.handler
-        self.client = redshift.client
-        self.sql = sql
-        self.info_cache = None
-        self.ds = None
-        self.files = None
-        self.temps = []
+        self.redshift = redshift
+        self.handler = redshift.handler
+        self.client = redshift.client
+        self.sql = sql
+        self.info_cache = None
+        self.status_cache = None
+        self.ds = None
+        self.files = None
+        self.temps = []
 
     def query(self):
         return self.sql
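For orientation, a rough sketch of the wrapper object the constructor expects. This is purely illustrative: the attribute names mirror what this file reads, and every value below is a placeholder rather than part of the project.

import boto3
from types import SimpleNamespace

# Hypothetical stand-in for the `redshift` wrapper passed to Execute/Query.
redshift = SimpleNamespace(
    client=boto3.client("redshift-data"),   # Redshift Data API client
    handler=None,                           # S3 helper used later by Query.results()
    database="dev",
    cluster=None,                           # falsy, so run() takes the serverless branch
    workgroup="default",
    secret=None,
    rpus=8,                                 # used for the cost estimate
    output="s3://example-bucket/queries",   # UNLOAD destination prefix
)

# ex = Execute(redshift, "CREATE TABLE example (id INT)")
# ex.run()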
@@ -58,9 +59,9 @@ class Execute:
     def run(self):
         if self.redshift.cluster:
-            resp = self.client.execute_statement(Sql=self.query(),
-                                                 Database=self.redshift.database,
-                                                 ClusterIdentifier=self.redshift.cluster)
+            resp = self.client.execute_statement(Sql = self.query(),
+                                                 Database = self.redshift.database,
+                                                 ClusterIdentifier = self.redshift.cluster)
         else:
             params = {"WorkgroupName": self.redshift.workgroup}
             if self.redshift.secret:
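The two branches correspond to the Redshift Data API's execute_statement call for a provisioned cluster versus a serverless workgroup. A minimal sketch with boto3; the cluster name, workgroup name, and secret ARN are placeholders:

import boto3

client = boto3.client("redshift-data")

# Provisioned cluster: identify the cluster and database.
resp = client.execute_statement(
    Sql="SELECT 1",
    Database="dev",
    ClusterIdentifier="my-cluster",
)

# Redshift Serverless: target a workgroup instead; SecretArn is only needed
# when not connecting with the caller's own IAM identity.
resp = client.execute_statement(
    Sql="SELECT 1",
    Database="dev",
    WorkgroupName="my-workgroup",
    # SecretArn="arn:aws:secretsmanager:us-east-1:123456789012:secret:redshift-creds",
)

statement_id = resp["Id"]   # later passed to describe_statement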
@@ -79,8 +80,13 @@ class Execute:
     def info(self):
+        if self.status_cache in ['FINISHED', 'ABORTED', 'FAILED']:
+            return self.info_cache
         res = self.client.describe_statement(Id=self.query_id)
-        self.info_cache = res
+        self.info_cache = res
+        self.status_cache = res['Status']
         return self.info_cache
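The early return works because FINISHED, ABORTED, and FAILED are terminal states: once describe_statement reports one of them, the cached response can be reused without another API call. A sketch of that polling pattern; the poll interval and statement_id are placeholders:

import time

TERMINAL = {"FINISHED", "ABORTED", "FAILED"}

def wait_for_statement(client, statement_id, poll_seconds=2):
    # Poll describe_statement until the statement reaches a terminal state,
    # then return the final response so it can be cached.
    while True:
        info = client.describe_statement(Id=statement_id)
        if info["Status"] in TERMINAL:
            return info
        time.sleep(poll_seconds)

# info = wait_for_statement(client, statement_id)
# info["Status"], info.get("Error"), info["UpdatedAt"] - info["CreatedAt"]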
@@ -94,7 +100,8 @@ class Execute:
             self.runtime = self.info_cache['UpdatedAt'] - self.info_cache['CreatedAt']
             if self.redshift.rpus:
-                self.cost = 0.36 * self.redshift.rpus * self.runtime.seconds / 3600.0 # $0.36 / RPU-hour
+                # $0.36 / RPU-hour
+                self.cost = 0.36 * self.redshift.rpus * self.runtime.seconds / 3600.0
 
         return stat # finalized state
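A worked example of the cost estimate, assuming a hypothetical 8-RPU serverless workgroup and a statement that ran for two minutes:

rpus = 8
runtime_seconds = 120
cost = 0.36 * rpus * runtime_seconds / 3600.0   # $0.36 per RPU-hour
print(cost)   # 0.096, i.e. roughly ten cents for this statement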
@@ -104,8 +111,8 @@ class Query(Execute):
     def query(self):
         self.out = os.path.join(self.redshift.output,
-                                str(random.random()),
-                                '')
+                                "results",
+                                str(random.random()) + ".")
         #query = f"unload ({repr(self.sql)}) to {repr(self.out)} " + \
         #        f"iam_role default " + \
         #        f"format as {self.DATA_STYLE} " + \
@@ -119,10 +126,14 @@ format as {self.DATA_STYLE}
 manifest;
 drop table temp_data;
 """
+        print(query)
         return query
 
     def manifest_files(self):
+        if self.files:
+            return self.files
+
         status = self.finish()
         if status == "FINISHED":
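From the visible fragments (the commented-out single-line version above, plus the format as {self.DATA_STYLE}, manifest; and drop table temp_data; tail), the generated SQL appears to stage the user query in a temp table and UNLOAD it with a manifest. A hedged reconstruction of that shape; the table name, prefix, and format are placeholders:

sql = "SELECT * FROM events WHERE day = '2024-01-31'"       # user query
out = "s3://example-bucket/queries/results/0.6011037306."   # prefix built in query()
data_style = "parquet"                                      # stand-in for self.DATA_STYLE

query = f"""
create temp table temp_data as {sql};
unload ('select * from temp_data')
to '{out}'
iam_role default
format as {data_style}
manifest;
drop table temp_data;
"""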
@@ -137,14 +148,19 @@ drop table temp_data;
                 js = json.load(f)
                 # Filter empty strings
-                files = [e['url'].strip() for e in js['entries'] if e['url'].strip()]
-                return files
+                self.files = [e['url'].strip() for e in js['entries'] if e['url'].strip()]
+                return self.files
         else:
             return status # canceled or error
 
     def results(self):
         # if it's not a list, then we've failed
         if type(self.manifest_files()) != type([]):
             raise Exception(f"""Query has status {self.status()} did not complete and
 thus has no results""")
         self.temps = [self.handler.s3.download(f) for f in self.manifest_files()]
         #local = parallel_map(self.handler.s3.download, self.manifest_files())
         self.ds = pa.dataset.dataset(self.temps)
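For reference, the manifest written by UNLOAD ... MANIFEST is a small JSON document listing the unloaded objects, which is exactly what the entries/url parsing above consumes. A sketch of reading it and opening the downloaded parts with pyarrow; all paths are placeholders:

import json
import pyarrow.dataset as ds

# Manifest shape: {"entries": [{"url": "s3://bucket/prefix0000_part_00.parquet", ...}, ...]}
with open("manifest.json") as f:
    manifest = json.load(f)
urls = [e["url"].strip() for e in manifest["entries"] if e["url"].strip()]

# After each object is downloaded locally (the handler.s3.download step above),
# the parts can be exposed as one pyarrow dataset and materialized on demand.
local_parts = ["/tmp/part_00.parquet", "/tmp/part_01.parquet"]   # placeholders
dataset = ds.dataset(local_parts, format="parquet")
table = dataset.to_table()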