bulk_insert_test.py
3.99 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import pandas as pd
import numpy as np
import unittest
import sqlalchemy
import bulk_insert
import clarity_to_csv as ctc
conn = ctc.get_clarity_engine(host='claritydev.uphs.upenn.edu').connect()
#conn = ctc.get_clarity_engine(host='clarityprod.uphs.upenn.edu').connect()
#TODO - scramble this damn data so it isn't PHI
class TestStuff(unittest.TestCase):
#How to deal with, I don't know!!
def test_error_in_burris_meds_insert(self):
datadir = "C:/Users/LynchSe/Documents/Data/Burris_Geobirth/"
dfmed = pd.read_csv(datadir + "from_burris_lab/Copy of GeoBirth_med_counts_20210713.csv")
sgids = dfmed.SIMPLE_GENERIC_C.unique()
dfsgids_raw = pd.DataFrame({'SIMPLE_GENERIC_C':sgids})
dfsgids = dfsgids_raw.loc[18:23]
import bulk_insert
table_def = '''
SIMPLE_GENERIC_C INT
'''
#there were overflow issues
bulk_insert.create_and_import(dfsgids, '##sgmedids', table_def, conn)
def test_something(self):
self.assertEqual(2+1, 3)
def test_format_row_for_insert_nans(self):
insert_chunk = np.array([[61,'hello','2020-08-21'],[np.NAN,np.NAN,np.NAN]], dtype='object')
column_types = ['NUM','STR','DT']
rows_of_data = bulk_insert.format_data_for_insert(insert_chunk, column_types)
self.assertEqual(rows_of_data, "(61,'hello','2020-08-21'),\n(NULL,NULL,NULL)")
def test_burris_pull_enc_short(self):
projdatadir = 'C:\\Users\\LynchSe\\Documents\\Data\\Burris_Geobirth\\'
d = '2021_05_26'
dfr = pd.read_csv(projdatadir + 'cohort_pat_delivery_{}.csv'.format(d))#.sample(1000)
df = dfr.iloc[30053:30069]
import bulk_insert
table_def = '''
HUP_MRN VARCHAR(30),
PAT_ID VARCHAR(18),
DELIVERY_DATE DATETIME,
HOSPITAL VARCHAR(20)
'''
bulk_insert.create_and_import(df, '##test_burris_pull_enc_short', table_def, conn, max_insert=200)
## REMEMBER TO LOOK IN CLARITYDEV, NOT PROD
def test_burris_pull_enc(self):
projdatadir = 'C:\\Users\\LynchSe\\Documents\\Data\\Burris_Geobirth\\'
d = '2021_05_26'
df = pd.read_csv(projdatadir + 'cohort_pat_delivery_{}.csv'.format(d))#.sample(1000)
import bulk_insert
table_def = '''
HUP_MRN VARCHAR(30),
PAT_ID VARCHAR(18),
DELIVERY_DATE DATETIME,
HOSPITAL VARCHAR(20)
'''
bulk_insert.create_and_import(df, '##test_burris_pull_enc', table_def, conn, max_insert=1000)
## REMEMBER TO LOOK IN CLARITYDEV, NOT PROD
def test_integration_1(self):
# Integration test, desired workflow as of July 2021
datadir = 'C:/Users/LynchSe/Documents/Data/Clarity_Tools_Selah/'
dfp = pd.read_csv(datadir + "test_cohort_36559.csv")
dfc = dfp[['PAT_ID', 'BIRTH_DATE', 'ENC_CNT_2017']]
tabledef = '''PAT_ID VARCHAR(18) NOT NULL, BIRTH_DATE DATETIME, ENC_CNT_2017 INTEGER
'''
import clarity_to_csv as ctc
conn = ctc.get_clarity_engine().connect()
bulk_insert.create_and_import(dfc, '##cohort_sample2', tabledef, conn)
## REMEMBER TO LOOK IN CLARITYDEV, NOT PROD
def test_integration_2(self):
datadir = 'C:/Users/LynchSe/Documents/Data/Clarity_Tools_Selah/'
dfcohort_inp1 = pd.read_excel(datadir + 'geobirth_patient_list_2018_2021.xls')
#TODO - scramble this data so it isn't PHI
import bulk_insert
table_def = 'MRN VARCHAR(100)'
bulk_insert.create_and_import(dfcohort_inp1[['mrn']], '##cohort_inp', table_def, conn, max_insert=1000)
## REMEMBER TO LOOK IN CLARITYDEV, NOT PROD
if __name__ == '__main__':
# unittest.main()
t = TestStuff()
t.test_error_in_burris_meds_insert()
# t.test_burris_pull_enc()
# t.test_burris_pull_enc_short()
# t.test_format_row_for_insert_nans()
# t.test_something()
''''''