Commit 9cfd5a48 by Selah Clarity

Deal with NaNs in import data

1 parent 00038591
......@@ -3,6 +3,7 @@
import pandas as pd
import numpy as np
import sqlalchemy
def create_table_sql(table_name, column_def):
......@@ -19,10 +20,18 @@ def create_and_import(data, table_name, table_def, conn, max_insert=1000):
cts = create_table_sql(table_name, table_def)
conn.execute(cts)
dtypes = get_dtypes_from_table_def(table_def)
print(dtypes)
insert_sql_generator = generate_insert_sql(table_name, data.columns, dtypes, data, max_insert=max_insert)
fline = 0
for insert_chunk_sql in insert_sql_generator:
try:
#TODO - handle this so long SQL insert doesn't obscure the screen
conn.execute(insert_chunk_sql)
# except sqlalchemy.exc.OperationalError as e:
# raise e
except Exception as e:
print("There was an exception during insert:\n" + e._message() + e.statement[:500] + '\n')
print("Aborting create_and_import()")
return False
end = time.time()
duration = end - start
line_cnt = len(data)
......@@ -51,7 +60,9 @@ def format_data_for_insert(rows, column_types):
row_fmt = zip(row, column_types)
items_fmttd = []
for (item, fmt) in row_fmt:
if (fmt == 'STR') | (fmt == 'DT'):
if item is np.nan:
item_fmttd = 'NULL'
elif (fmt == 'STR') | (fmt == 'DT'):
item_fmttd = "'{}'".format(item)
else:
item_fmttd = "{}".format(item)
......
import pandas as pd
import numpy as np
import unittest
import sqlalchemy
......@@ -17,6 +18,42 @@ class TestStuff(unittest.TestCase):
def test_something(self):
self.assertEqual(2+1, 3)
def test_format_row_for_insert_nans(self):
    """Check that NaN cells are rendered as SQL NULL when formatting rows.

    The first row holds ordinary values for the NUM, STR and DT column
    types; the second row is NaN in every column, so the expected output
    has an unquoted NULL in each position.
    """
    # Use np.nan: the upper-case np.NAN alias was removed in NumPy 2.0.
    # dtype='object' keeps the cells as Python objects instead of coercing
    # the mixed row values to a single NumPy dtype.
    insert_chunk = np.array(
        [[61, 'hello', '2020-08-21'], [np.nan, np.nan, np.nan]],
        dtype='object',
    )
    column_types = ['NUM', 'STR', 'DT']
    rows_of_data = bulk_insert.format_data_for_insert(insert_chunk, column_types)
    self.assertEqual(rows_of_data, "(61,'hello','2020-08-21'),\n(NULL,NULL,NULL)")
def test_burris_pull_enc_short(self):
    """Import a 16-row slice of the delivery cohort CSV via create_and_import.

    Reads the cohort file from a machine-specific directory, keeps rows
    30053-30068, and bulk-inserts them into '##test_burris_pull_enc_short'
    in chunks of at most 200 rows. The test fails if create_and_import()
    returns False, which is how it signals an aborted insert.

    NOTE(review): presumably this slice was chosen because it contains the
    NaN cells that broke the import -- confirm against the data file.
    NOTE(review): `conn` is not defined in this method; it is assumed to be
    a module-level database connection -- confirm.
    """
    import bulk_insert

    projdatadir = 'C:\\Users\\LynchSe\\Documents\\Data\\Burris_Geobirth\\'
    d = '2021_05_26'
    dfr = pd.read_csv(projdatadir + 'cohort_pat_delivery_{}.csv'.format(d))
    df = dfr.iloc[30053:30069]
    table_def = '''
HUP_MRN VARCHAR(30),
PAT_ID VARCHAR(18),
DELIVERY_DATE DATETIME,
HOSPITAL VARCHAR(20)
'''
    result = bulk_insert.create_and_import(
        df, '##test_burris_pull_enc_short', table_def, conn, max_insert=200
    )
    # Previously the return value was discarded, so a failed insert (which
    # returns False instead of raising) still let the test pass.
    self.assertIsNot(result, False, "create_and_import() reported a failed insert")
def test_burris_pull_enc(self):
    """Import the full delivery cohort CSV via create_and_import.

    Reads the cohort file from a machine-specific directory and
    bulk-inserts every row into '##test_burris_pull_enc' in chunks of at
    most 1000 rows. The test fails if create_and_import() returns False,
    which is how it signals an aborted insert.

    NOTE(review): `conn` is not defined in this method; it is assumed to be
    a module-level database connection -- confirm.
    """
    import bulk_insert

    projdatadir = 'C:\\Users\\LynchSe\\Documents\\Data\\Burris_Geobirth\\'
    d = '2021_05_26'
    # Append .sample(1000) to the read below to iterate on a smaller subset.
    df = pd.read_csv(projdatadir + 'cohort_pat_delivery_{}.csv'.format(d))
    table_def = '''
HUP_MRN VARCHAR(30),
PAT_ID VARCHAR(18),
DELIVERY_DATE DATETIME,
HOSPITAL VARCHAR(20)
'''
    result = bulk_insert.create_and_import(
        df, '##test_burris_pull_enc', table_def, conn, max_insert=1000
    )
    # Previously the return value was discarded, so a failed insert (which
    # returns False instead of raising) still let the test pass.
    self.assertIsNot(result, False, "create_and_import() reported a failed insert")
def test_integration_1(self):
# Integration test, desired workflow as of July 2021
......@@ -44,8 +81,11 @@ class TestStuff(unittest.TestCase):
if __name__ == '__main__':
unittest.main()
# t = TestStuff()
# unittest.main()
t = TestStuff()
t.test_burris_pull_enc()
t.test_burris_pull_enc_short()
# t.test_format_row_for_insert_nans()
# t.test_something()
Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!