From ab344374d9e9dd13d5570bb2a28176f42a2288af Mon Sep 17 00:00:00 2001
From: Ari Brown
Date: Fri, 15 Sep 2023 10:20:53 -0400
Subject: [PATCH] added binary file for an easy athena console

---
 bin/minerva-console | 32 ++++++++++++++++++++++++++++++++
 minerva/athena.py   | 13 +++++++++++++
 pyproject.toml      |  8 +++++---
 3 files changed, 50 insertions(+), 3 deletions(-)
 create mode 100755 bin/minerva-console

diff --git a/bin/minerva-console b/bin/minerva-console
new file mode 100755
index 0000000..d95ae1a
--- /dev/null
+++ b/bin/minerva-console
@@ -0,0 +1,32 @@
+#!/usr/bin/env python3
+import minerva
+import pprint
+import readline
+import argparse
+
+pp = pprint.PrettyPrinter(indent=4)
+
+parser = argparse.ArgumentParser(description="""
+REPL for the Athena SQL engine
+""")
+parser.add_argument("-p", "--profile", default="hay", help="The AWS profile to use")
+parser.add_argument("-o", "--output", default="s3://haystac-pmo-athena/output")
+args = parser.parse_args()
+
+m = minerva.Minerva(args.profile)
+athena = m.athena(args.output)
+
+text = input("> ")
+while text != "\\q":
+    query = athena.query(text)
+
+    try:
+        data = query.results()
+        pp.pprint(data.head(10))
+        print()
+        print(f"\t({'$%.2f' % query.cost}, {query.runtime})")
+    except Exception as e:
+        print(e)
+
+    text = input("> ")
+
diff --git a/minerva/athena.py b/minerva/athena.py
index 3fd9d59..75e39e4 100644
--- a/minerva/athena.py
+++ b/minerva/athena.py
@@ -42,6 +42,7 @@ class Execute:
         self.params = [str(p) for p in params]
         self.info_cache = None
         self.temps = []
+        self.ds = None
 
     # The string of the query
     def query(self):
@@ -80,6 +81,10 @@ class Execute:
 
         ms = self.info_cache['Statistics']['TotalExecutionTimeInMillis']
         self.runtime = datetime.timedelta(seconds=ms / 1000)
+
+        scanned = self.info_cache['Statistics']['DataScannedInBytes']
+        self.cost = 5.0 * scanned / (1024 ** 4) # $5/TB scanned
+
         return stat # finalized state
 
 
@@ -122,11 +127,19 @@ class Query(Execute):
     # dataset of the results.
     # Calls `self.manifest_files()` which blocks via `self.finish()`
     def results(self):
+        if self.ds:
+            return self.ds
+
         self.temps = [self.handler.s3.download(f) for f in self.manifest_files()]
         #local = parallel_map(self.handler.s3.download, self.manifest_files())
         self.ds = pa.dataset.dataset(self.temps)
         return self.ds
 
+    # Return scalar results
+    # Abstracts away a bunch of keystrokes
+    def scalar(self):
+        return self.results().head(1)[0][0].as_py()
+
     def __enter__(self):
         return self
 
diff --git a/pyproject.toml b/pyproject.toml
index 32d4933..ce4de04 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,13 +1,15 @@
 [tool.poetry]
 name = "minerva"
-version = "0.3.5"
+version = "0.4.0"
 description = "Easier access to AWS Athena and Redshift"
 authors = [
     "Ari Brown ",
-    "Roshan Punnoose "
+    "Roshan Punnoose ",
+    "Alex Zabriskie "
 ]
 packages = [
-    { include = "minerva/**/*.py"}
+    { include = "minerva/**/*.py" },
+    { include = "bin/*" }
 ]
 readme = "README.md"
 