forked from bellwether/minerva
added more examples
This commit is contained in:
parent
a3374fd85c
commit
745919e587
3 changed files with 138 additions and 0 deletions
35
examples/parallelization.py
Normal file
@@ -0,0 +1,35 @@
import minerva
import pprint

pp = pprint.PrettyPrinter(indent=4)

# Create the Minerva object, which gives you access to the account under the
# profile `hay`
m = minerva.Minerva("hay")

# Get the Athena object
athena = m.athena("s3://haystac-pmo-athena/")

# Parallelize across the `data` and split it into `n` chunks, one chunk per process.
# Since `num_agents` is a number, it's turned into a range and then split.
num_agents = 10000
parallel = athena.parallelize("trajectories", n = 200, data = num_agents)
for agents in parallel:
    # Everything *needs* to have a column in order for unloading to parquet to work,
    # so scalar values have to be assigned a name; here we use `as cnt` to
    # create a temporary column called `cnt`
    sql = f"""
        select count(*) as cnt
        from trajectories.baseline
        where agent >= {min(agents)} and
              agent < {max(agents)}
        group by agent
    """
    parallel << athena.query(sql, partition = {"agent": agents})

pp.pprint(parallel.results().head(10))

# We also get important statistics
print(parallel.runtime)
print(parallel.cost)

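For intuition, `parallelize` above presumably expands the numeric `data` into a range of agent ids and splits it into `n` roughly equal chunks, much like this plain-Python sketch (the helper name is illustrative, not part of the Minerva API):

import math

def chunk_range(total, n):
    # Expand a count into ids 0..total-1 and split them into n chunks
    ids = list(range(total))
    size = math.ceil(total / n)
    return [ids[i:i + size] for i in range(0, total, size)]

# chunk_range(10000, 200) -> 200 chunks of 50 consecutive agent ids each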
86
examples/repartition.py
Normal file
@@ -0,0 +1,86 @@
import minerva as m
import servers as s
import json
import math
import sys

src_top_level = "s3://phase1.trial2/ta1.kitware/te.apl/transforms/plain/"
dst_top_level = "s3://phase1.trial2/ta1.kitware/te.apl/transforms/ari_sorted/"


def sort_hour(mach, hour):
    image = "436820952613.dkr.ecr.us-east-1.amazonaws.com/apl-pyarrow-experiment"

    # Prep the info for the docker container
    srted_loc = src_top_level + '/'.join(hour.split('/')[-4:])
    srted_loc += "/data.zstd.parquet"
    variables = {"source": hour,            # hour
                 "destination": srted_loc}  # precise location of the new file

    # Create the machine to run it
    dock = m.Docker(machine = mach,
                    container = image,
                    variables = {"PAYLOAD": json.dumps(variables)},
                    stdout = sys.stdout,
                    stderr = sys.stderr)
    dock.run()


def repartition(mach, agents):
    image = "436820952613.dkr.ecr.us-east-1.amazonaws.com/apl-pyarrow-experiment-agent"

    # Prep the info for the docker container
    variables = {"min_agent": min(agents),
                 "max_agent": max(agents),
                 "source": src_top_level,
                 "destination": dst_top_level,
                 "secondary_destination": None}

    # Create the machine to run it
    dock = m.Docker(machine = mach,
                    container = image,
                    variables = {"PAYLOAD": json.dumps(variables)},
                    stdout = sys.stdout,
                    stderr = sys.stderr)
    dock.run()


#####################################
# Prep the work
# Find out how many hours there are in the dataset
pool_size = 1

objs = s.m.s3.ls(src_top_level + "year=")
hours = set(["s3://" + '/'.join([o.bucket_name, *o.key.split("/")[0:-1]])
             for o in objs])

print(f"{len(hours)} hours to sort")
hours = sorted(hours)
hours = [hours[0]]  # only process the first hour for now

# Split the agents into chunks for each machine in the pool
agents = list(range(200))
size = math.ceil(len(agents) / pool_size)
groups = [agents[i:i + size] for i in range(0, len(agents), size)]

try:
    #######################################
    # Create the machines
    # This also waits for them to be made
    pool = m.Pool(s.worker, pool_size)

    ########################################
    # Now that we have the pool, put them to work
    # Each will pull an item off of `data`, process it, and then keep
    # doing that until the list is empty

    # First part: sort the individual files
    pool.run(sort_hour, data=hours)

    # Second part: repartition
    pool.run(repartition, data=groups)
finally:
    pool.terminate()

print(f"Cost: ${pool.cost()}")

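For reference, the PAYLOAD environment variable handed to the repartition container is just the JSON-encoded `variables` dict; with the single-chunk run above (pool_size = 1, agents = range(200)) it would look roughly like:

{"min_agent": 0, "max_agent": 199,
 "source": "s3://phase1.trial2/ta1.kitware/te.apl/transforms/plain/",
 "destination": "s3://phase1.trial2/ta1.kitware/te.apl/transforms/ari_sorted/",
 "secondary_destination": null}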
17
examples/servers.py
Normal file
@@ -0,0 +1,17 @@
import minerva
import json

m = minerva.Minerva("hay-te")
pier = m.pier(subnet_id = "subnet-08438df942a357b21",  # haystac-te-subnet-public1-us-east-1c
              sg_groups = ["sg-005d1f7b02f1e4b06",     # ssh
                           "sg-06f81d2d2d58dfc6b"],    # default
              iam = "Minerva",
              key_pair = ("Ari-Brown-HAY-TE", "~/.ssh/Ari-Brown-HAY-TE.pem"))

# Provision one worker machine; `num` is only used to give it a unique name
def worker(num):
    return pier.machine(instance_type = "r6a.2xlarge",
                        username = "ubuntu",
                        name = f"minerva-worker-{num}",
                        ami = "ami-0796c86095e0ac8fe",
                        disk_size = 512)

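The repartition example hands `worker` to `m.Pool(s.worker, pool_size)`, which presumably calls it once per worker index; calling it directly would request a single machine (a sketch, assuming `worker` can also be used standalone):

mach = worker(0)  # an r6a.2xlarge named "minerva-worker-0"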