forked from bellwether/minerva
adding dask examples
This commit is contained in:
parent c0ff6af866
commit fe06b6b808
6 changed files with 149 additions and 18 deletions
65 dask_test.py Normal file
@@ -0,0 +1,65 @@
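# Scratch test: connect to a dask.distributed cluster, pull Athena query
# results into a Dask DataFrame, and time a few simple aggregations.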
import sys
import minerva
from minerva.timing import Timing
import dask
from dask.distributed import Client

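# Minerva entry point; "hay" is presumably the project/dataset name.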
m = minerva.Minerva("hay")

print(f"connecting to {sys.argv[1]}")
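# argv[1] is the scheduler address, e.g. "tcp://scheduler:8786".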
client = Client(sys.argv[1])

try:
    # Practice with a big DataFrame
    # https://matthewrocklin.com/blog/work/2017/01/12/dask-dataframes

    # import numpy as np
    # import dask.array as da
    import dask.dataframe as dd
    import time

    # https://stackoverflow.com/questions/43796774/loading-local-file-from-client-onto-dask-distributed-cluster

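    # Minerva's Athena wrapper (internal API); query results are staged
    # under this S3 prefix.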
    athena = m.athena("s3://haystac-pmo-athena/")

    # Iteratively load files and scatter them to the cluster
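    # Run the query, wait for it to complete (finish() presumably blocks),
    # then point dask.dataframe at the CSV output Athena wrote to S3.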
    with Timing("athena query"):
        query = athena.execute("select * from trajectories.kitware limit 1000000", format='csv')
        query.finish()
        df = dd.read_csv(query.info_cache['ResultConfiguration']['OutputLocation'])

    # query = athena.query("select * from trajectories.kitware limit 100000000")
    # df = dd.read_parquet(query.manifest_files())
    # df = query.distribute_results(client, size=10000)
    print("distributed")

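    # Passing explicit divisions to set_index skips the sampling pass Dask
    # would otherwise run to pick partition boundaries; this assumes agent
    # is an integer ID in [0, 10000].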
    with Timing("partitioning"):
        # df = df.categorize('agent')
        # df['agent'] = df['agent'].cat.as_ordered()
        # partitions = df.describe(include=['category']).compute().iloc[0][0]
        # df = df.set_index('agent', npartitions=partitions)

        divisions = list(range(0, 10001))
        df = df.set_index('agent', divisions=divisions)

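    # persist() materializes the re-indexed frame in cluster memory, so the
    # timings below measure compute rather than I/O.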
    with Timing("persisting"):
        dp = df.persist()

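    # Drop into an interactive shell for ad-hoc poking at dp.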
    import IPython
    IPython.embed()

    with Timing("count()"):
        print(dp.count().compute())

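    # Because partitions align with agent ranges, a per-partition groupby on
    # the index is equivalent to a global per-agent groupby, with no shuffle.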
    with Timing("mean latitude"):
        print(dp.map_partitions(lambda p: p.groupby(p.index).latitude.mean()).compute())

    with Timing("mean longitude"):
        print(dp.map_partitions(lambda p: p.groupby(p.index).longitude.mean()).compute())

finally:
    ########## FIN #######################
    client.close()
    print("end")
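    # Block until the user presses Enter so the process stays alive.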
    input()