# minerva/examples/parallelization.py

import minerva
import pprint

pp = pprint.PrettyPrinter(indent=4)

# Create the Minerva object which gives you access to the account under the
# profile `hay`
m = minerva.Minerva("hay")

# Get the Athena object
athena = m.athena("s3://haystac-pmo-athena/")

# Parallelize across the `data` and split it into `n` chunks, one chunk per process.
# Since `num_agents` is a number, it's turned into a range and then split.
num_agents = 10000
parallel   = athena.parallelize("trajectories", n = 200, data = num_agents)
for agents in parallel:
    # `min(agents)` and `max(agents)` are the first and last agent of this
    # chunk, so both bounds must be inclusive; otherwise the last agent of
    # every chunk would be silently dropped from the results.
    first_agent = min(agents)
    last_agent  = max(agents)

    # Everything *needs* to have a column name in order for unloading to parquet
    # to work, so scalar values have to be assigned something; here we use
    # `as cnt` to create a temporary column called `cnt`.
    # NOTE(review): `trajectories.basline` looks like a typo for `baseline` --
    # confirm the actual table name before changing it.
    sql = f"""
    select count(*) as cnt
    from trajectories.basline
    where agent >= {first_agent} and
          agent <= {last_agent}
    group by agent
    """

    # Hand the query for this chunk to the parallel runner with `<<`.
    parallel << athena.query(sql, partition = {"agent": agents})

# Show the first ten rows of the results
pp.pprint(parallel.results().head(10))

# We also get important statistics
print(parallel.runtime)
print(parallel.cost)