significant improvement to the readme and verification that all the examples work

2024-01-31 16:18:32 -05:00 · 2024-01-31 16:18:32 -05:00 · 5dccce53e9
commit 5dccce53e9
parent e3c11fb1aa
9 changed files with 275 additions and 109 deletions
--- a/examples/dask_cluster.py
+++ b/examples/dask_cluster.py
@@ -1,5 +1,4 @@
-from minerva.cluster  import Cluster
-from minerva.pier     import Pier
+import minerva
 from minerva.timing   import Timing
 from dask.distributed import Client
 import dask
@@ -7,7 +6,7 @@ import dask
 ########### PREP ############################

 def worker(pier, n):
-    mach = pier.machine(ami    = "ami-01f85b935dc9f674c",  # dask on ubuntu 22.04 x86
+    mach = pier.machine(ami    = "ami-0399a4f70ca684620",  # dask on ubuntu 22.04 x86
                 instance_type = "t3.medium",
                 username      = "ubuntu",
                 name          = f"dask-worker-{n}",
@@ -17,24 +16,25 @@ def worker(pier, n):
    return mach

 def scheduler(pier):
-    mach = pier.machine(ami    = "ami-01f85b935dc9f674c",  # dask on ubuntu 22.04 x86
+    mach = pier.machine(ami    = "ami-0399a4f70ca684620",  # dask on ubuntu 22.04 x86
                 instance_type = "t3.medium",
                 username      = "ubuntu",
+                 disk_size     = 32,
                 name          = f"dask-scheduler",
                 variables     = {"type": "scheduler"})
    return mach

 ########## CLUSTER ##########################

-pier    = Pier("hay",
-               subnet_id = "subnet-05eb26d8649a093e1", # project-subnet-public1-us-east-1a
-               sg_groups = ["sg-0f9e555954e863954",    # ssh
-                            "sg-0b34a3f7398076545",    # default
-                            "sg-04cd2626d91ac093c"],   # dask (8786, 8787)
-               key_pair  = ("Ari-Brown-HAY", "~/.ssh/Ari-Brown-HAY.pem"),
-               iam       = "S3+SSM+CloudWatch+ECR")
+m       = minerva.Minerva("hay")
+pier    = m.pier(subnet_id = "subnet-05eb26d8649a093e1", # project-subnet-public1-us-east-1a
+                 sg_groups = ["sg-0f9e555954e863954",    # ssh
+                              "sg-0b34a3f7398076545",    # default
+                              "sg-04cd2626d91ac093c"],   # dask (8786, 8787)
+                 key_pair  = ("Ari-Brown-HAY", "~/.ssh/Ari-Brown-HAY.pem"),
+                 iam       = "S3+SSM+CloudWatch+ECR")

-cluster = Cluster(pier, scheduler, worker, num_workers=5)
+cluster = pier.cluster(scheduler, worker, num_workers=2)
 cluster.start()

 ########## USAGE ########################
@@ -43,31 +43,9 @@ try:
    client  = Client(cluster.public_location)
    print(client)

-    # Practice with a big array
-    # https://matthewrocklin.com/blog/work/2017/01/12/dask-dataframes
-
-    #import numpy as np
-    #import dask.array as da
-    import dask.dataframe as dd
-    import time
-
-    # https://stackoverflow.com/questions/43796774/loading-local-file-from-client-onto-dask-distributed-cluster
-
-    # Iteratively load files and scatter them to the cluster
-    #
-    # futures = []
-    # for fn in filenames:
-    #     df = pd.read_csv(fn)
-    #     future = client.scatter(df)
-    #     futures.append(future)
-    #
-    # ddf = dd.from_delayed(futures, meta=df)
-
-    # query = athena.query("select * from trajectories")
-    # ddf   = query.distribute_results(client)
-
-    with Timing("reading parquet"):
-        df = dd.read_parquet("s3://haystac-archive-phase1.trial1/ta1.kitware/ta1/simulation/train/")
+    athena = m.athena("s3://haystac-pmo-athena/")
+    query  = athena.query("select * from trajectories.basline where agent < 100")
+    df     = query.distribute_results(client)

    with Timing("persisting"):
        dp = df.persist()