Skip to content
Toggle navigation
Projects
Groups
Snippets
Help
Toggle navigation
This project
Loading...
Sign in
cicTeam
/
Clarity_Tools_Selah
Go to a project
Project
Repository
Issues
0
Merge Requests
0
Pipelines
Wiki
Settings
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Commit 00038591
authored
Jul 21, 2021
by
Selah Clarity
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
simplify import data to clarity, clean up tests
1 parent
ba1be159
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
179 additions
and
81 deletions
bulk_insert.py
bulk_insert_test.py
clarity_tests.py
clarity_to_csv.py
clarity_to_csv_test.py
sqlite_demo.py
test_template.py
bulk_insert.py
View file @
0003859
...
@@ -13,6 +13,38 @@ def create_table_sql(table_name, column_def):
...
@@ -13,6 +13,38 @@ def create_table_sql(table_name, column_def):
return
create_table_sql
return
create_table_sql
def create_and_import(data, table_name, table_def, conn, max_insert=1000):
    """Create ``table_name`` on ``conn`` and insert every row of ``data``.

    Args:
        data: Rows to import. Must expose ``.columns`` and support ``len()``
            (presumably a pandas DataFrame -- confirm against callers).
        table_name: Name of the table to create and fill.
        table_def: Column definition text; handed to ``create_table_sql`` to
            build the CREATE statement and to ``get_dtypes_from_table_def``
            to derive the per-column type codes.
        conn: Object exposing ``execute(sql)``; every statement is run on it.
        max_insert: Forwarded unchanged to ``generate_insert_sql``
            (presumably the row limit per generated INSERT -- confirm there).

    Prints the derived type codes and, when finished, the number of rows
    and the elapsed time.
    """
    import time

    # perf_counter() is monotonic, so the reported duration cannot be skewed
    # by system clock adjustments that happen during a long import.
    start = time.perf_counter()

    conn.execute(create_table_sql(table_name, table_def))

    dtypes = get_dtypes_from_table_def(table_def)
    print(dtypes)

    insert_sql_generator = generate_insert_sql(
        table_name, data.columns, dtypes, data, max_insert=max_insert
    )
    # Each generated statement is executed as soon as it is produced.
    for insert_chunk_sql in insert_sql_generator:
        conn.execute(insert_chunk_sql)

    duration = time.perf_counter() - start
    line_cnt = len(data)
    print("{} lines imported in {:.1f} s".format(line_cnt, duration))
#WARNING - can't handle table names with spaces in them
def get_dtypes_from_table_def(table_def):
    """Translate a column definition string into a list of type codes.

    ``table_def`` is split on commas; each piece is expected to look like
    ``<column name> <sql type> [constraints...]``. Only the type token that
    follows the column name is inspected, so a column whose *name* happens
    to contain a type keyword (e.g. ``APPOINTMENT_DATE DATETIME``, which
    contains "INT") is no longer misclassified. Matching is
    case-insensitive.

    Args:
        table_def: Comma-separated column definitions.

    Returns:
        One code per column, in order: ``'STR'`` for VARCHAR types,
        ``'NUM'`` for types containing INT, ``'DT'`` for DATETIME.

    Raises:
        Exception: if a column's type matches none of the known keywords.
    """
    import re

    # Checked in this order, which is the precedence the if/elif chain used.
    type_codes = (
        (r'VARCHAR', 'STR'),
        (r'INT', 'NUM'),
        (r'DATETIME', 'DT'),
    )
    # Column name ([bracketed], "quoted" or a bare token), then the type token.
    name_then_type = re.compile(r'(?:\[[^\]]*\]|"[^"]*"|\S+)\s+(\S+)')

    column_types = []
    for var_def in table_def.split(','):
        column_def = var_def.strip()
        parsed = name_then_type.match(column_def)
        # With no separate type token, fall back to the whole definition so
        # single-token input is still classified the way it used to be.
        type_token = parsed.group(1) if parsed else column_def
        for pattern, code in type_codes:
            if re.search(pattern, type_token, re.IGNORECASE):
                column_types.append(code)
                break
        else:
            raise Exception("Unrecognized column definition data type")
    return column_types
def
format_data_for_insert
(
rows
,
column_types
):
def
format_data_for_insert
(
rows
,
column_types
):
data_formatted
=
""
data_formatted
=
""
for
row
in
rows
:
for
row
in
rows
:
...
...
bulk_insert_test.py
View file @
0003859
import
pandas
as
pd
import
pandas
as
pd
import
numpy
as
np
import
unittest
import
sqlalchemy
import
bulk_insert
import
bulk_insert
#%%
table_name
=
"##COHORT_BULK_INSERT_TEST"
test_data
=
[
[
578
,
'29389'
,
'2011-09-03'
],
[
332
,
'11384'
,
'2011-09-07'
],
[
372
,
'14487'
,
'2011-09-07'
],
[
331
,
'41384'
,
'2011-09-07'
],
[
931
,
'24587'
,
'2011-10-03'
]
]
df_test_data
=
pd
.
DataFrame
(
test_data
)
#%%
table_column_def
=
'''
PAT_ID VARCHAR(18) NOT NULL,
MRN VARCHAR(30) NOT NULL,
DELIVERY_DATE DATETIME NOT NULL'''
column_names
=
[
"MRN"
,
"PAT_ID"
,
"DELIVERY_DATE"
]
column_types
=
[
'NUM'
,
'STR'
,
'DT'
]
#%%
import
bulk_insert
print
(
bulk_insert
.
create_table_sql
(
table_name
,
table_column_def
))
#%%
insert_sql_generator
=
bulk_insert
.
generate_insert_sql
(
table_name
,
column_names
,
column_types
,
df_test_data
,
max_insert
=
3
)
print
(
next
(
insert_sql_generator
))
#%%
print
(
next
(
insert_sql_generator
))
#%%
#Dataframes should never be passed to this function
#print(bulk_insert.format_data_for_insert(df_test_data, ['NUM','STR','DT']))
print
(
bulk_insert
.
format_data_for_insert
(
test_data
,
[
'NUM'
,
'STR'
,
'DT'
]))
#%%
print
(
bulk_insert
.
collect_insert_sql
(
table_name
,
column_names
,
column_types
,
df_test_data
,
max_insert
=
3
))
#%% Test with clarity
import
clarity_to_csv
as
ctc
import
clarity_to_csv
as
ctc
#conn = ctc.get_clarity_engine().connect()
conn
=
ctc
.
get_clarity_engine
(
host
=
'claritydev.uphs.upenn.edu'
)
.
connect
()
#conn = ctc.get_clarity_engine(host='clarityprod.uphs.upenn.edu').connect()
create_create_table_sql
=
"""
DROP TABLE IF EXISTS {table_name};
CREATE TABLE {table_name} ({table_column_def});
#TODO - scramble this damn data so it isn't PHI
"""
.
format
(
table_name
=
table_name
,
table_column_def
=
table_column_def
)
class
TestStuff
(
unittest
.
TestCase
):
import
bulk_insert
insert_sql_generator
=
bulk_insert
.
generate_insert_sql
(
table_name
,
column_names
,
column_types
,
df_test_data
,
max_insert
=
3
)
def
test_something
(
self
):
self
.
assertEqual
(
2
+
1
,
3
)
insert_sql1
=
next
(
insert_sql_generator
)
insert_sql2
=
next
(
insert_sql_generator
)
def
test_integration_1
(
self
):
#%%
# Integration test, desired workflow as of July 2021
conn
.
execute
(
create_create_table_sql
)
conn
.
execute
(
insert_sql1
)
datadir
=
'C:/Users/LynchSe/Documents/Data/Clarity_Tools_Selah/'
conn
.
execute
(
insert_sql2
)
dfp
=
pd
.
read_csv
(
datadir
+
"test_cohort_36559.csv"
)
dfc
=
dfp
[[
'PAT_ID'
,
'BIRTH_DATE'
,
'ENC_CNT_2017'
]]
tabledef
=
'''PAT_ID VARCHAR(18) NOT NULL, BIRTH_DATE DATETIME, ENC_CNT_2017 INTEGER
'''
import
clarity_to_csv
as
ctc
conn
=
ctc
.
get_clarity_engine
()
.
connect
()
bulk_insert
.
create_and_import
(
dfc
,
'##cohort_sample2'
,
tabledef
,
conn
)
def
test_integration_2
(
self
):
datadir
=
'C:/Users/LynchSe/Documents/Data/Clarity_Tools_Selah/'
dfcohort_inp1
=
pd
.
read_excel
(
datadir
+
'geobirth_patient_list_2018_2021.xls'
)
#TODO - scramble this data so it isn't PHI
import
bulk_insert
table_def
=
'MRN VARCHAR(100)'
bulk_insert
.
create_and_import
(
dfcohort_inp1
[[
'mrn'
]],
'##cohort_inp'
,
table_def
,
conn
,
max_insert
=
1000
)
if
__name__
==
'__main__'
:
unittest
.
main
()
# t = TestStuff()
# t.test_something()
clarity_tests.py
View file @
0003859
...
@@ -18,7 +18,7 @@ def get_mssql_engine(
...
@@ -18,7 +18,7 @@ def get_mssql_engine(
selahcredsfilename
=
'C:
\\
Users
\\
LynchSe
\\
Documents
\\
selah_clarity_credentials.txt'
selahcredsfilename
=
'C:
\\
Users
\\
LynchSe
\\
Documents
\\
selah_clarity_credentials.txt'
def
get_clarity_engine
(
credsfilename
=
selahcredsfilename
,
timeout
=
600
):
def
get_clarity_engine
(
credsfilename
=
selahcredsfilename
,
timeout
=
600
,
host
=
'clarityprod.uphs.upenn.edu'
):
with
open
(
credsfilename
,
'r'
)
as
credsfile
:
with
open
(
credsfilename
,
'r'
)
as
credsfile
:
name
=
credsfile
.
readline
()
.
strip
()
name
=
credsfile
.
readline
()
.
strip
()
pw
=
credsfile
.
readline
()
.
strip
()
pw
=
credsfile
.
readline
()
.
strip
()
...
@@ -27,9 +27,11 @@ def get_clarity_engine(credsfilename = selahcredsfilename, timeout=600):
...
@@ -27,9 +27,11 @@ def get_clarity_engine(credsfilename = selahcredsfilename, timeout=600):
##### BEGIN ACTUAL TESTS #####
##### BEGIN ACTUAL TESTS #####
#because we dont' want to hit clarity more than necessary, we run tests one at a time
#because we are dealing with weird hanging issues, we run tests one at a time
#because we don't want to hit clarity more than necessary, we use dev server
class
TestStuff
(
unittest
.
TestCase
):
class
TestStuff
(
unittest
.
TestCase
):
#Test a basic connect and execute
#Test a basic connect and execute
def
test_basic_conn_execute
(
self
):
def
test_basic_conn_execute
(
self
):
eng
=
get_clarity_engine
()
eng
=
get_clarity_engine
()
...
@@ -38,33 +40,86 @@ class TestStuff(unittest.TestCase):
...
@@ -38,33 +40,86 @@ class TestStuff(unittest.TestCase):
self
.
assertEqual
(
len
(
list
(
res
)),
3
)
self
.
assertEqual
(
len
(
list
(
res
)),
3
)
def
test_dev_conn_execute
(
self
):
eng
=
get_clarity_engine
(
host
=
'claritydev.uphs.upenn.edu'
)
with
eng
.
connect
()
as
conn
:
res
=
conn
.
execute
(
'SELECT TOP 3 PAT_ID FROM PAT_ENC'
)
self
.
assertEqual
(
len
(
list
(
res
)),
3
)
#This hangs...sometimes...why? Hung on the second time I ran it.
#Is it that it hangs after the raw connection
def
test_temp_table_persistence
(
self
):
def
test_temp_table_persistence
(
self
):
eng
=
get_clarity_engine
()
eng
=
get_clarity_engine
(
host
=
'claritydev.uphs.upenn.edu'
)
with
eng
.
connect
()
as
conn
:
with
eng
.
connect
()
as
conn
:
conn
.
execute
(
'DROP TABLE IF EXISTS ##COHORT'
)
conn
.
execute
(
'DROP TABLE IF EXISTS ##COHORT'
)
conn
.
execute
(
'SELECT TOP 3 PAT_ID INTO ##COHORT FROM PAT_ENC'
)
conn
.
execute
(
'SELECT TOP 3 PAT_ID INTO ##COHORT FROM PAT_ENC'
)
res
=
conn
.
execute
(
'SELECT * FROM ##COHORT'
)
res
=
conn
.
execute
(
'SELECT * FROM ##COHORT'
)
self
.
assertEqual
(
len
(
list
(
res
)),
3
)
self
.
assertEqual
(
len
(
list
(
res
)),
3
)
#we expect the global temp table to disappear with new connection
#we expect the global temp table to disappear with new connection
import
time
print
(
"Sleeping 3 seconds..."
)
time
.
sleep
(
3
)
print
(
"Done sleeping"
)
with
eng
.
connect
()
as
conn
:
with
eng
.
connect
()
as
conn
:
print
(
"`eng.connect() as conn` finished executing"
)
with
self
.
assertRaises
(
Exception
)
as
e
:
with
self
.
assertRaises
(
Exception
)
as
e
:
res
=
conn
.
execute
(
'SELECT * FROM ##COHORT'
)
res
=
conn
.
execute
(
'SELECT * FROM ##COHORT'
)
print
(
e
.
exception
)
print
(
e
.
exception
)
# def test_raw_connection(self):
def
will_this_hang
(
self
):
# eng = get_clarity_engine()
eng
=
get_clarity_engine
(
host
=
'claritydev.uphs.upenn.edu'
)
# with eng.raw_connection().cursor() as cur:
print
(
"Try first connection as raw connection..."
)
# cur.execute('DROP TABLE IF EXISTS ##COHORT')
with
eng
.
raw_connection
()
.
cursor
()
as
cur
:
# cur.execute('SELECT TOP 3 PAT_ID INTO ##COHORT FROM PAT_ENC')
print
(
"connect executed"
)
# cur.execute('SELECT * FROM ##COHORT')
cur
.
execute
(
'DROP TABLE IF EXISTS ##COHORT'
)
# self.assertEqual(len([row for row in cur]), 3)
cur
.
execute
(
'SELECT TOP 3 PAT_ID INTO ##COHORT FROM PAT_ENC'
)
cur
.
execute
(
'SELECT * FROM ##COHORT'
)
self
.
assertEqual
(
len
([
row
for
row
in
cur
]),
3
)
print
(
"recreate engine"
)
eng
=
get_clarity_engine
(
host
=
'claritydev.uphs.upenn.edu'
,
timeout
=
60
)
#if I get rid of this line it doesn't hang
print
(
"Try second connection as regular connection..."
)
with
eng
.
connect
()
as
conn
:
print
(
"connect executed"
)
with
self
.
assertRaises
(
Exception
)
as
e
:
res
=
conn
.
execute
(
'SELECT * FROM ##COHORT'
)
#I think this is where it hangs? or not
print
(
e
.
exception
)
print
(
"Try third connection as regular connection..."
)
with
eng
.
connect
()
as
conn
:
print
(
"connect executed"
)
conn
.
execute
(
'DROP TABLE IF EXISTS ##COHORT'
)
conn
.
execute
(
'SELECT TOP 3 PAT_ID INTO ##COHORT FROM PAT_ENC'
)
res
=
conn
.
execute
(
'SELECT * FROM ##COHORT'
)
self
.
assertEqual
(
len
(
list
(
res
)),
3
)
#we expect the global temp table to disappear with new connection
if
__name__
==
'__main__'
:
def
test_raw_connection
(
self
):
eng
=
get_clarity_engine
(
host
=
'claritydev.uphs.upenn.edu'
)
with
eng
.
raw_connection
()
.
cursor
()
as
cur
:
cur
.
execute
(
'DROP TABLE IF EXISTS ##COHORT'
)
cur
.
execute
(
'SELECT TOP 3 PAT_ID INTO ##COHORT FROM PAT_ENC'
)
cur
.
execute
(
'SELECT * FROM ##COHORT'
)
self
.
assertEqual
(
len
([
row
for
row
in
cur
]),
3
)
unittest
.
main
()
#%%
if
__name__
==
'__main__'
:
# unittest.main()
t
=
TestStuff
()
t
.
test_basic_conn_execute
()
t
.
test_dev_conn_execute
()
t
.
test_temp_table_persistence
()
t
.
will_this_hang
()
t
.
test_raw_connection
()
clarity_to_csv.py
View file @
0003859
...
@@ -12,7 +12,7 @@ def get_mssql_engine(
...
@@ -12,7 +12,7 @@ def get_mssql_engine(
database
=
"clarity_snapshot_db"
,
database
=
"clarity_snapshot_db"
,
domain
=
"UPHS"
,
domain
=
"UPHS"
,
port
=
"1433"
,
port
=
"1433"
,
timeout
=
7200
,
timeout
=
600
,
#2hr?
password
=
None
,
password
=
None
,
):
):
from
sqlalchemy
import
create_engine
from
sqlalchemy
import
create_engine
...
@@ -27,13 +27,14 @@ def get_mssql_engine(
...
@@ -27,13 +27,14 @@ def get_mssql_engine(
#%% My functions
#%% My functions
selahcredsfilename
=
'C:
\\
Users
\\
LynchSe
\\
Documents
\\
selah_clarity_credentials.txt'
selahcredsfilename
=
'C:
\\
Users
\\
LynchSe
\\
Documents
\\
selah_clarity_credentials.txt'
def
get_clarity_engine
(
credsfilename
=
selahcredsfilename
,
timeout
=
600
):
def
get_clarity_engine
(
credsfilename
=
selahcredsfilename
,
timeout
=
600
,
host
=
'clarityprod.uphs.upenn.edu'
):
with
open
(
credsfilename
,
'r'
)
as
credsfile
:
with
open
(
credsfilename
,
'r'
)
as
credsfile
:
nameline
=
credsfile
.
readline
()
.
strip
()
nameline
=
credsfile
.
readline
()
.
strip
()
pwline
=
credsfile
.
readline
()
.
strip
()
pwline
=
credsfile
.
readline
()
.
strip
()
clarity_engine
=
get_mssql_engine
(
username
=
nameline
,
password
=
pwline
,
timeout
=
timeout
)
clarity_engine
=
get_mssql_engine
(
username
=
nameline
,
password
=
pwline
,
timeout
=
timeout
,
host
=
host
)
return
clarity_engine
return
clarity_engine
def
clarity_to_csv
(
sqlfilename
,
csvfilenames
,
dbconn
=
None
):
def
clarity_to_csv
(
sqlfilename
,
csvfilenames
,
dbconn
=
None
):
print
(
"Running SQL from {}"
.
format
(
sqlfilename
))
print
(
"Running SQL from {}"
.
format
(
sqlfilename
))
with
open
(
sqlfilename
,
'r'
)
as
sqlfile
:
with
open
(
sqlfilename
,
'r'
)
as
sqlfile
:
...
...
clarity_to_csv_test.py
View file @
0003859
...
@@ -6,8 +6,8 @@ import sqlparse
...
@@ -6,8 +6,8 @@ import sqlparse
#%%
#%%
testquerydir
=
'C:
\\
Users
\\
LynchSe
\\
Documents
\\
Repos
\\
Covid19Related
\\
selah
\\
clarity_to_csv_tests
\\
'
testquerydir
=
'C:
\\
Users
\\
LynchSe
\\
Documents
\\
Repos
\\
Covid19Related
\\
selah
\\
Clarity_Tools_Selah
\\
'
testdatadir
=
'C:
\\
Users
\\
LynchSe
\\
Documents
\\
Data
\\
clarity_to_csv_tests
\\
'
testdatadir
=
'C:
\\
Users
\\
LynchSe
\\
Documents
\\
Data
\\
Clarity_Tools_Selah
\\
'
#TODO spin up a sqlite database here
#TODO spin up a sqlite database here
...
...
sqlite_demo.py
0 → 100644
View file @
0003859
from
sqlalchemy
import
create_engine
#%%
# Path of the SQLite database file this demo connects to.
sqlite_db_filepath = "C:\\Users\\LynchSe\\Documents\\Data\\database.db"

db_url = 'sqlite:///{}'.format(sqlite_db_filepath)
e = create_engine(db_url)
c = e.connect()

# Run the connectivity check, the table creation and the sample insert
# in that order on the same connection.
for setup_sql in (
    'SELECT 1;',
    'CREATE TABLE IF NOT EXISTS COHORT (EMPI VARCHAR(90) NOT NULL);',
    "INSERT INTO COHORT (EMPI) values ('8001111117'),('1000000000'),('8333333002');",
):
    c.execute(setup_sql)

res = c.execute("SELECT EMPI FROM COHORT;")
#%%
# Show both the type and the content of each returned row.
for line in res:
    print(type(line))
    print(line)
#%%
c.close()
test_template.py
0 → 100644
View file @
0003859
import
pandas
as
pd
import
unittest
class TestStuff(unittest.TestCase):
    """Skeleton test case to copy when starting a new test module."""

    def test_something(self):
        """Placeholder assertion: 2 + 1 equals 3."""
        total = 2 + 1
        self.assertEqual(total, 3)


if __name__ == '__main__':
    # unittest.main()
    # The test method is called directly rather than through the runner.
    t = TestStuff()
    t.test_something()
Write
Preview
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment