minerva/examples/parallelization.py
2024-07-03 11:04:11 -04:00

35 lines
1.1 KiB
Python

import minerva
import pprint
# Example: run one Athena query per chunk of agent ids in parallel, then print
# a sample of the combined results plus the runtime and cost of the whole job.
pp = pprint.PrettyPrinter(indent=4)

# Create the Minerva object which gives you access to the account under the
# profile `hay`
m = minerva.Minerva("hay")

# Get the Athena object
athena = m.athena("s3://haystac-pmo-athena/")

# Parallelize across the `data` and split it into `n` chunks, one chunk per process.
# Since `num_agents` is a number, it's turned into a range and then split.
num_agents = 10000
parallel = athena.parallelize("trajectories", n=200, data=num_agents)

for agents in parallel:
    # Everything *needs* to have a column in order for unloading to parquet to
    # work, so scalar values have to be assigned something; here we use
    # `as cnt` to create a temporary column called `cnt`.
    #
    # Both bounds are inclusive: `agents` is one chunk of the range above, so
    # `max(agents)` is itself an agent in this chunk and must not be dropped.
    #
    # NOTE(review): `basline` looks like a typo for `baseline` -- confirm
    # against the real table name before changing it.
    sql = f"""
        select count(*) as cnt
        from trajectories.basline
        where agent >= {min(agents)} and
              agent <= {max(agents)}
        group by agent
    """

    # Submit this chunk's query to `parallel` through its `<<` operator.
    parallel << athena.query(sql, partition={"agent": agents})

# Show the first ten rows of the combined results.
pp.pprint(parallel.results().head(10))

# We also get important statistics
print(parallel.runtime)
print(parallel.cost)