# minerva/examples/export_table.py

import minerva
import math

# Source table, and the column whose values drive the partitioning.
table = "haystacdb.kitware_med"
partition_by = "chunk"

# Destination table name and the S3 location its DDL points at.
out  = "haystacdb.exported_table"
dest = "s3://haystac-te-athena/test_export/"

m      = minerva.Minerva("hay-te")
athena = m.athena("s3://haystac-te-athena/")

# (name, dtype) pairs for the source table. The DDL needs different type
# names than the ones reported here: "long" and "integer" both become "int".
cols = [(name, "int" if dtype in ("long", "integer") else dtype)
        for name, dtype in athena.describe_columns(table)]

# Separate the partition column from the remaining data columns.
partition_matches = []
rest_cols = []
for col in cols:
    if col[0] == partition_by:
        partition_matches.append(col)
    else:
        rest_cols.append(col)
part_col = partition_matches[0]  # IndexError if the partition column is absent

# "name type" fragments for the create-table statement.
column_defs   = ', '.join(' '.join(col) for col in rest_cols)
partition_def = ' '.join(part_col)

string = f"""
create external table {out}({column_defs})
partitioned by ({partition_def})
location '{dest}'
tblproperties ('spark.sql.sources.provider' = 'delta',
               'parquet.compression' = 'zstd')
"""
print(string)

# Create the destination table, copy the source rows into it in batches of
# partition values, and always drop the destination table afterwards.
try:
    # Run the create-table statement and wait for it to finish.
    e = athena.execute(string)
    e.finish()

    # Number of distinct partition values in the source table.
    count = athena.query(f"select count(distinct {partition_by}) as count from {table}").scalar()

    concurrent = 100 # 100 is max concurrent writers allowed

    # Materialise the column names as a list: a bare `map` iterator would be
    # exhausted by the first join, leaving every later batch with an empty
    # select list. The partition column is placed last so the select order
    # lines up with the destination table (data columns first, partition
    # column last), since the insert below names no target columns.
    col_names = [col[0] for col in rest_cols] + [part_col[0]]
    select_list = ', '.join(col_names)

    # NOTE(review): batching by [i, i + concurrent) assumes the partition
    # values are the integers 0..count-1 -- confirm against the source data.
    for i in range(0, count, concurrent):
        print(i)
        sql = f"insert into {out} select {select_list} from {table} where {partition_by} >= {i} and {partition_by} < {i + concurrent}"
        print(sql)
        q = athena.query(sql)
        q.finish()

finally:
    # Runs on success and on failure alike.
    # NOTE(review): `out` already carries the "haystacdb." prefix -- confirm
    # that delete_table expects the qualified name alongside the database.
    athena.delete_table("haystacdb", out)