bulk_insert.py
2.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#GOAL insert a csv into a temp table in Clarity
#This library takes a dataframe and imports it into Clarity
import pandas as pd
import numpy as np
def create_table_sql(table_name, column_def):
create_table_sql = """
DROP TABLE IF EXISTS {table_name};
CREATE TABLE {table_name} ({table_column_def});
""".format(table_name=table_name, table_column_def=column_def)
return create_table_sql
def format_data_for_insert(rows, column_types):
data_formatted = ""
for row in rows:
row_fmt = zip(row, column_types)
items_fmttd = []
for (item, fmt) in row_fmt:
if (fmt == 'STR') | (fmt == 'DT'):
item_fmttd = "'{}'".format(item)
else:
item_fmttd = "{}".format(item)
items_fmttd.append(item_fmttd)
row_formatted = ",".join([str(item) for item in items_fmttd])
row_formatted = "({})".format(row_formatted)
data_formatted += row_formatted + ",\n"
data_formatted = data_formatted[0:-2]
return data_formatted
def generate_insert_sql(table_name, column_names, column_types, data, max_insert=1000):
if type(data) is pd.core.frame.DataFrame:
data = data.values
import math
num_splits = math.ceil(len(data)/max_insert)
data_split = np.array_split(data, num_splits)
column_names_str = ','.join(column_names)
insert_chunk_sql_template = "INSERT INTO {table} ({column_names_str})\nvalues\n{rows_of_data}\n"
for insert_chunk in data_split:
rows_of_data = format_data_for_insert(insert_chunk, column_types)
insert_chunk_sql = insert_chunk_sql_template.format(table = table_name, column_names_str = column_names_str, rows_of_data = rows_of_data )
yield insert_chunk_sql
def collect_insert_sql(table_name, column_names, column_types, data, max_insert=1000):
insert_sql_generator = generate_insert_sql(table_name, column_names, column_types, data, max_insert)
insert_sql = ""
for insert_chunk_sql in insert_sql_generator:
insert_sql += insert_chunk_sql + ";\n"
return insert_sql