diff --git a/README.md b/README.md new file mode 100644 index 0000000..bd5ea02 --- /dev/null +++ b/README.md @@ -0,0 +1,44 @@ +# Minerva +Minerva is the Roman equivalent of Athena, and Athena is AWS's database that +stores results in S3. + +In order to ease programmatic access to Athena and offer blocking access (so +that your code waits for the result), I wrote `minerva` to make it seamless. + +The results are returned as pyarrow datasets (with parquet files as the +underlying structure). + +# Basic Usage +``` +import access as a + +athena = a.Athena("hay", "s3://haystac-pmo-athena/") +query = athena.query('select * from "trajectories"."kitware" limit 10') +data = query.results() +print(data.head(10)) +``` + +In the first substantive line (after the import), a connection to Athena is made. The first argument is the name of the AWS profile in +`~/.aws/credentials`. The second argument is the S3 location where the results +will be stored. + +In the second substantive line, an SQL query is made. This is **non-blocking**. +The query is off and running and you are free to do whatever you want now. + +In the third substantive line, the results are requested. This is **blocking**, so the code +will wait here (checking with AWS every 5 seconds) until the results are ready. +Then, the results are downloaded to `/tmp/` and lazily interpreted as parquet +files in the form of a `pyarrow.dataset.Dataset`. + +# Returning Scalar Values +In SQL, scalar values get assigned an anonymous column -- Athena doesn't like +that. Thus, you have to assign the column a name. + +``` +data = athena.query('select count(*) as my_col from "trajectories"."kitware"').results() +print(data.head(1)) +``` + +# TODO +* parallelize the downloading of files + diff --git a/access.py b/access.py index 2befd90..d6c7fe4 100644 --- a/access.py +++ b/access.py @@ -66,6 +66,8 @@ class Query: # Because we're using `UNLOAD`, we get a manifest of the files # that make up our data.
files = self.manifest(tiedot).strip().split("\n") + + # TODO parallelize this local = [self.handler.download(f) for f in files] self.ds = pa.dataset.dataset(local)