"""Example: count trajectory rows per agent using parallel Athena queries.

The agent id range is split into chunks; one query is submitted per chunk,
then the combined results, total runtime, and cost are printed.
"""
import pprint

import minerva

pp = pprint.PrettyPrinter(indent=4)

# Create the Minerva object which gives you access to the account under the
# profile `hay`
m = minerva.Minerva("hay")

# Get the Athena object
athena = m.athena("s3://haystac-pmo-athena/")

# Parallelize across the `data` and split it into `n` chunks, one chunk per
# process. Since `num_agents` is a number, it's turned into a range and then
# split.
num_agents = 10000
parallel = athena.parallelize("trajectories", n=200, data=num_agents)

for agents in parallel:
    # Both bounds are inclusive so that the highest agent id in each chunk is
    # not silently dropped from the counts.
    # NOTE(review): assumes the chunks are disjoint -- confirm against
    # `parallelize`.
    first_agent = min(agents)
    last_agent = max(agents)

    # Everything *needs* to have a column in order for unloading to parquet to
    # work, so scalar values have to be assigned something; here we use
    # `as cnt` to create a temporary column called `cnt`.
    # NOTE(review): `trajectories.basline` looks like a typo for `baseline` --
    # confirm the actual table name before changing it.
    sql = f"""
        select count(*) as cnt
        from trajectories.basline
        where agent >= {first_agent} and agent <= {last_agent}
        group by agent
    """

    # Submit the query text built above (the original passed an undefined
    # name `query`, which raised NameError).
    parallel << athena.query(sql, partition={"agent": agents})

pp.pprint(parallel.results().head(10))

# We also get important statistics
print(parallel.runtime)
print(parallel.cost)