"""Benchmark script: load a trajectory CSV from S3 into a Dask cluster.

Usage: pass the address of a running Dask scheduler as the first
command-line argument.

The script reads one CSV file with ``dask.dataframe``, re-indexes it on the
``agent`` column using explicit divisions, persists the result on the
cluster, and then runs a handful of aggregations.  Each stage is wrapped in
a ``minerva.timing.Timing`` context manager labelled with the stage name
(presumably it reports elapsed time -- confirm against ``minerva.timing``).

Several alternative loading strategies are kept below as commented-out
code so they can be swapped in when comparing approaches.
"""
import sys
import minerva
from minerva.timing import Timing
import dask
import dask.dataframe as dd
import time

#dask.config.set({'distributed.worker.multiprocessing-method': 'fork'})
import dask.distributed
from dask.distributed import Client

# Fail with a usage message instead of a bare IndexError when the scheduler
# address is missing.
if len(sys.argv) < 2:
    sys.exit(f"usage: {sys.argv[0]} <dask-scheduler-address>")

m = minerva.Minerva("hay")

print(f"connecting to {sys.argv[1]}")
client = Client(sys.argv[1])

# Alternative input for the commented-out `dd.read_parquet(manifest_files, ...)`
# call further down.
#manifest_files = ['s3://haystac-pmo-athena/results/0.32812059783186387/20231130_212053_00057_gqvh5_b311740c-02a5-42a5-9996-b17bdbf604ca',
#    's3://haystac-pmo-athena/results/0.32812059783186387/20231130_212053_00057_gqvh5_1abe3b5f-ab17-401e-91e9-a5c4f3fa17b9',
#    's3://haystac-pmo-athena/results/0.32812059783186387/20231130_212053_00057_gqvh5_c08442a6-74b8-4cf0-afc5-ab1a03f59eea',
#    's3://haystac-pmo-athena/results/0.32812059783186387/20231130_212053_00057_gqvh5_399a2786-68b5-4c40-961e-476451885f7f',
#    's3://haystac-pmo-athena/results/0.32812059783186387/20231130_212053_00057_gqvh5_8a699085-7d6b-426a-8a59-1ae4a6a183b9',
#    's3://haystac-pmo-athena/results/0.32812059783186387/20231130_212053_00057_gqvh5_0ca91f4c-85cf-4c93-b039-4f307dde0a86',
#    's3://haystac-pmo-athena/results/0.32812059783186387/20231130_212053_00057_gqvh5_bf38065d-fde8-45d3-87eb-3a56479f8b57',
#    's3://haystac-pmo-athena/results/0.32812059783186387/20231130_212053_00057_gqvh5_6a0ebea5-9a23-4fb5-8547-d4bb82f6a821',
#    's3://haystac-pmo-athena/results/0.32812059783186387/20231130_212053_00057_gqvh5_3464c3f6-fa5c-4d6b-bdf4-be387ff9dfda',
#    's3://haystac-pmo-athena/results/0.32812059783186387/20231130_212053_00057_gqvh5_9088425e-0c87-49ef-a883-92c217a715bb',
#    's3://haystac-pmo-athena/results/0.32812059783186387/20231130_212053_00057_gqvh5_0517e42b-089b-4819-b166-fa21489485a6',
#    's3://haystac-pmo-athena/results/0.32812059783186387/20231130_212053_00057_gqvh5_0197ff95-8c3c-4121-8077-a499927bb097',
#    's3://haystac-pmo-athena/results/0.32812059783186387/20231130_212053_00057_gqvh5_990579d4-0bae-4547-a630-e5a05341d264',
#    's3://haystac-pmo-athena/results/0.32812059783186387/20231130_212053_00057_gqvh5_2c838980-65d6-4a3d-898d-bfcb63e361bd',
#    's3://haystac-pmo-athena/results/0.32812059783186387/20231130_212053_00057_gqvh5_930df083-3c4c-45d6-963e-51e47d3a704c',
#    's3://haystac-pmo-athena/results/0.32812059783186387/20231130_212053_00057_gqvh5_6739cc33-43e0-468d-a6d2-fa17161bbd5e',
#    's3://haystac-pmo-athena/results/0.32812059783186387/20231130_212053_00057_gqvh5_685d81de-ffff-4629-bfb2-4bca3941474b',
#    's3://haystac-pmo-athena/results/0.32812059783186387/20231130_212053_00057_gqvh5_41e3cb39-a917-460a-bd86-413df774806b',
#    's3://haystac-pmo-athena/results/0.32812059783186387/20231130_212053_00057_gqvh5_ce533449-b14f-439e-92dc-69abdeda706f',
#    's3://haystac-pmo-athena/results/0.32812059783186387/20231130_212053_00057_gqvh5_c68ff721-8b97-4abf-8573-ac3fd617af97',
#    's3://haystac-pmo-athena/results/0.32812059783186387/20231130_212053_00057_gqvh5_04f13894-63d8-453c-ace7-87368511e94b',
#    's3://haystac-pmo-athena/results/0.32812059783186387/20231130_212053_00057_gqvh5_0d906712-e469-419a-8bf7-508b0b79848d',
#    's3://haystac-pmo-athena/results/0.32812059783186387/20231130_212053_00057_gqvh5_c514089d-c2eb-496a-a1a2-8501ecaf7d84',
#    's3://haystac-pmo-athena/results/0.32812059783186387/20231130_212053_00057_gqvh5_fe04a9d7-56ad-4e99-bbfa-8adb398af971',
#    's3://haystac-pmo-athena/results/0.32812059783186387/20231130_212053_00057_gqvh5_1ea10655-a063-468e-ad9e-0788eb15bd8d',
#    's3://haystac-pmo-athena/results/0.32812059783186387/20231130_212053_00057_gqvh5_7146bbe6-1278-43d3-a9e3-1a9e6a776105',
#    's3://haystac-pmo-athena/results/0.32812059783186387/20231130_212053_00057_gqvh5_add3ea7e-961f-4760-88b3-c839d4a344b3',
#    's3://haystac-pmo-athena/results/0.32812059783186387/20231130_212053_00057_gqvh5_2a00d610-18bf-4611-8f58-f47bdd484504',
#    's3://haystac-pmo-athena/results/0.32812059783186387/20231130_212053_00057_gqvh5_54ea410d-fa42-4ab5-bc2b-8179ad1cfb46',
#    's3://haystac-pmo-athena/results/0.32812059783186387/20231130_212053_00057_gqvh5_07aded26-d650-4852-9d3f-732409db3fd7']

# Input currently in use: a single CSV file.
manifest_files = ["s3://haystac-pmo-athena/results/f236b9a3-8aa7-4e29-b4c7-1fee5981a0ad.csv"]


def full_s3(obj):
    """Return the ``s3://bucket/key`` URL for *obj*.

    *obj* must expose ``bucket_name`` and ``key`` string attributes.  Only
    the commented-out "Delta Table" loading path below uses this helper.
    """
    return "s3://" + obj.bucket_name + "/" + obj.key


try:
    # `athena` and `records` are only consumed by the commented-out Athena
    # query variants below; they are kept so those variants can be re-enabled.
    athena = m.athena("s3://haystac-pmo-athena/")
    records = 1_000_000_000

    # Iteratively load files and scatter them to the cluster
    with Timing("read parquet from athena"):
        # Delta Table
        #files = m.s3.ls("s3://haystac-archive-phase1.trial1/ta1.kitware/ta1/simulation/train/")
        #parqs = [full_s3(o) for o in files if o.key.endswith('.parquet')]

        # 20 partitions at a time, concat 20 at a time
        #df = dd.read_parquet(parqs[0:20])#.astype({"agent": "category",
        #                                  #         "timestamp": "datetime64[ns, UTC]"})
        #for i in range(20, len(parqs), 20):
        #    new = dd.read_parquet(parqs[i:i + 20])#.astype({"agent": "category",
        #                                          #         "timestamp": "datetime64[ns, UTC]"})
        #    df = df.concat(new)

        # 20 partitions at a time, concat all at once
        #df = dd.concat([dd.read_parquet(parqs[i:i + 20], engine='fastparquet') for i in range(0, len(parqs), 20)])

        # All partitions at once
        #df = dd.read_parquet(parqs, engine='fastparquet')

        # Single CSV file
        #query = athena.execute(f"select * from trajectories.kitware limit {records}", format="csv")
        #query.finish()
        #df = dd.read_csv(query.info_cache['ResultConfiguration']['OutputLocation'])
        df = dd.read_csv(manifest_files[0])

        # Unload to Parquet files
        #query = athena.query(f"select * from trajectories.kitware limit {records}")
        #df = dd.read_parquet(query.manifest_files())
        #df = dd.read_parquet(manifest_files, engine='fastparquet')

    with Timing("partitioning"):
        # 10,001 boundaries -> 10,000 partitions keyed on `agent`.
        # NOTE(review): assumes agent ids are integers within [0, 10000] -- confirm.
        divisions = list(range(0, 10001))
        df = df.set_index('agent', divisions=divisions)

    with Timing("persisting"):
        dp = df.persist()
        # With the distributed scheduler persist() returns immediately, so
        # block until the partitions are actually resident on the workers;
        # otherwise this timer only measures task submission.
        dask.distributed.wait(dp)
        print(dp)

    #import IPython
    #IPython.embed()

    # Every aggregation below is lazy until .compute() is called.  Without
    # it, print() shows only the task-graph placeholder and the timer covers
    # graph construction rather than the computation.
    with Timing("memory usage"):
        print(dp.get_partition(400).memory_usage().compute())

    with Timing("count()"):
        print(dp.count().compute())

    with Timing("memory usage"):
        print(dp.get_partition(400).memory_usage().compute())

    with Timing("mean latitude"):
        print(dp.groupby(dp.index).latitude.mean().compute())

    with Timing("count()"):
        print(dp.count().compute())

    #with Timing("mean longitude"):
    #    print(dp.groupby(dp.index).longitude.mean().compute())
finally:
    ########## FIN #######################
    # Always release the scheduler connection, even if a stage above failed.
    print("closing client")
    client.close()

#print("end")
#input()