forked from bellwether/minerva
significant improvement to the readme and verification that all the examples work
This commit is contained in:
parent
e3c11fb1aa
commit
5dccce53e9
9 changed files with 275 additions and 109 deletions
|
|
@ -43,14 +43,15 @@ class Execute:
|
|||
Execute is meant to be used for DDL statements such as CREATE DATABASE/TABLE
|
||||
"""
|
||||
def __init__(self, redshift, sql):
|
||||
self.redshift = redshift
|
||||
self.handler = redshift.handler
|
||||
self.client = redshift.client
|
||||
self.sql = sql
|
||||
self.info_cache = None
|
||||
self.ds = None
|
||||
self.files = None
|
||||
self.temps = []
|
||||
self.redshift = redshift
|
||||
self.handler = redshift.handler
|
||||
self.client = redshift.client
|
||||
self.sql = sql
|
||||
self.info_cache = None
|
||||
self.status_cache = None
|
||||
self.ds = None
|
||||
self.files = None
|
||||
self.temps = []
|
||||
|
||||
def query(self):
|
||||
return self.sql
|
||||
|
|
@ -58,9 +59,9 @@ class Execute:
|
|||
|
||||
def run(self):
|
||||
if self.redshift.cluster:
|
||||
resp = self.client.execute_statement(Sql=self.query(),
|
||||
Database=self.redshift.database,
|
||||
ClusterIdentifier=self.redshift.cluster)
|
||||
resp = self.client.execute_statement(Sql = self.query(),
|
||||
Database = self.redshift.database,
|
||||
ClusterIdentifier = self.redshift.cluster)
|
||||
else:
|
||||
params = {"WorkgroupName": self.redshift.workgroup}
|
||||
if self.redshift.secret:
|
||||
|
|
@ -79,8 +80,13 @@ class Execute:
|
|||
|
||||
|
||||
def info(self):
|
||||
if self.status_cache in ['FINISHED', 'ABORTED', 'FAILED']:
|
||||
return self.info_cache
|
||||
|
||||
res = self.client.describe_statement(Id=self.query_id)
|
||||
self.info_cache = res
|
||||
self.info_cache = res
|
||||
self.status_cache = res['Status']
|
||||
|
||||
return self.info_cache
|
||||
|
||||
|
||||
|
|
@ -94,7 +100,8 @@ class Execute:
|
|||
self.runtime = self.info_cache['UpdatedAt'] - self.info_cache['CreatedAt']
|
||||
|
||||
if self.redshift.rpus:
|
||||
self.cost = 0.36 * self.redshift.rpus * self.runtime.seconds / 3600.0 # $0.36 / RPU-hour
|
||||
# $0.36 / RPU-hour
|
||||
self.cost = 0.36 * self.redshift.rpus * self.runtime.seconds / 3600.0
|
||||
|
||||
return stat # finalized state
|
||||
|
||||
|
|
@ -104,8 +111,8 @@ class Query(Execute):
|
|||
|
||||
def query(self):
|
||||
self.out = os.path.join(self.redshift.output,
|
||||
str(random.random()),
|
||||
'')
|
||||
"results",
|
||||
str(random.random()) + ".")
|
||||
#query = f"unload ({repr(self.sql)}) to {repr(self.out)} " + \
|
||||
# f"iam_role default " + \
|
||||
# f"format as {self.DATA_STYLE} " + \
|
||||
|
|
@ -119,10 +126,14 @@ format as {self.DATA_STYLE}
|
|||
manifest;
|
||||
drop table temp_data;
|
||||
"""
|
||||
print(query)
|
||||
return query
|
||||
|
||||
|
||||
def manifest_files(self):
|
||||
if self.files:
|
||||
return self.files
|
||||
|
||||
status = self.finish()
|
||||
|
||||
if status == "FINISHED":
|
||||
|
|
@ -137,14 +148,19 @@ drop table temp_data;
|
|||
js = json.load(f)
|
||||
|
||||
# Filter empty strings
|
||||
files = [e['url'].strip() for e in js['entries'] if e['url'].strip()]
|
||||
self.files = [e['url'].strip() for e in js['entries'] if e['url'].strip()]
|
||||
|
||||
return files
|
||||
return self.files
|
||||
else:
|
||||
return status # canceled or error
|
||||
|
||||
|
||||
def results(self):
|
||||
# if it's not a list, then we've failed
|
||||
if type(self.manifest_files()) != type([]):
|
||||
raise Exception(f"""Query has status {self.status()} did not complete and
|
||||
thus has no results""")
|
||||
|
||||
self.temps = [self.handler.s3.download(f) for f in self.manifest_files()]
|
||||
#local = parallel_map(self.handler.s3.download, self.manifest_files())
|
||||
self.ds = pa.dataset.dataset(self.temps)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue