Commit 643d13e5 by Selah Clarity

sql to df function, sqlalchemy play

1 parent 73c201b8
...@@ -32,6 +32,9 @@ def get_clarity_engine(credsfilename = selahcredsfilename, timeout=600, host='cl ...@@ -32,6 +32,9 @@ def get_clarity_engine(credsfilename = selahcredsfilename, timeout=600, host='cl
class TestStuff(unittest.TestCase): class TestStuff(unittest.TestCase):
    def test_clarity_dev_connection(self):
        # Smoke test: building an engine for the dev host should not raise.
        # NOTE(review): no assertion — success is simply "no exception"; the
        # engine is not used further. Connectivity is only proven on connect().
        eng = get_clarity_engine(host='claritydev.uphs.upenn.edu')
#Test a basic connect and execute #Test a basic connect and execute
def test_basic_conn_execute(self): def test_basic_conn_execute(self):
eng = get_clarity_engine() eng = get_clarity_engine()
...@@ -116,6 +119,7 @@ if __name__ == '__main__': ...@@ -116,6 +119,7 @@ if __name__ == '__main__':
# unittest.main() # unittest.main()
t = TestStuff() t = TestStuff()
t.test_clarity_dev_connection()
t.test_basic_conn_execute() t.test_basic_conn_execute()
t.test_dev_conn_execute() t.test_dev_conn_execute()
t.test_temp_table_persistence() t.test_temp_table_persistence()
......
...@@ -35,6 +35,31 @@ def get_clarity_engine(credsfilename = selahcredsfilename, timeout=600, host='cl ...@@ -35,6 +35,31 @@ def get_clarity_engine(credsfilename = selahcredsfilename, timeout=600, host='cl
return clarity_engine return clarity_engine
def sqltext_to_dfs(sqltext, dbconn):
    """Execute every statement in *sqltext* and return the results as DataFrames.

    Parameters
    ----------
    sqltext : str
        Raw SQL text, possibly containing multiple ';'-separated statements.
    dbconn : SQLAlchemy connection
        Open connection on which each statement is executed.

    Returns
    -------
    tuple of pandas.DataFrame
        One DataFrame per statement that produced a result set.  Statements
        that return no rows (e.g. DDL/INSERT) are still executed for their
        side effects but contribute no DataFrame.
    """
    dfs = []
    for stmt in extract_sql_statements(sqltext):
        res = dbconn.execute(stmt)
        # returns_rows is False for statements with no result set; skip those.
        if res.returns_rows:
            rows = [list(row) for row in res]
            dfs.append(pd.DataFrame(data=rows, columns=list(res.keys())))
    return tuple(dfs)
def sqlfile_to_dfs(sqlfilename, dbconn):
    """Read the SQL file at *sqlfilename* and run it through sqltext_to_dfs.

    Returns the tuple of DataFrames produced by the file's statements.
    """
    with open(sqlfilename, 'r') as handle:
        contents = handle.read()
    return sqltext_to_dfs(contents, dbconn)
def clarity_to_csv(sqlfilename, csvfilenames, dbconn=None): def clarity_to_csv(sqlfilename, csvfilenames, dbconn=None):
print("Running SQL from {}".format(sqlfilename)) print("Running SQL from {}".format(sqlfilename))
import time import time
...@@ -53,14 +78,19 @@ def clarity_to_csv(sqlfilename, csvfilenames, dbconn=None): ...@@ -53,14 +78,19 @@ def clarity_to_csv(sqlfilename, csvfilenames, dbconn=None):
print("Query ran and exported {} in {:.1f} s".format(dtstr, duration)) print("Query ran and exported {} in {:.1f} s".format(dtstr, duration))
def extract_sql_statements(sqltext):
    """Split raw SQL text into a list of individual statement strings.

    Comments are stripped first (so a semicolon inside a comment cannot
    split a statement), then the text is split on ';' and each piece is
    trimmed.  Empty fragments are dropped — a trailing semicolon commonly
    leaves an empty last statement, and ';;' would otherwise yield an empty
    statement that fails when executed.

    Parameters
    ----------
    sqltext : str
        Raw SQL text, possibly containing comments and multiple statements.

    Returns
    -------
    list of str
        The non-empty, whitespace-trimmed statements in order.
    """
    import sqlparse  # third-party; imported locally as in the original
    cleaned = sqlparse.format(sqltext, strip_comments=True).strip()
    statements = [stmt.strip() for stmt in cleaned.split(';')]
    return [stmt for stmt in statements if stmt]
def clarity_to_csv_inner(sqltext, csvfilenames, sqalconn, verbose=False):
sqlstatements = extract_sql_statements(sqltext)
which_statement = 0 which_statement = 0
which_csvfile = 0 which_csvfile = 0
for sqlstatement in sqlstatements: for sqlstatement in sqlstatements:
...@@ -77,7 +107,7 @@ def clarity_to_csv_inner(sqltext, csvfilenames, sqalconn, verbose=False): ...@@ -77,7 +107,7 @@ def clarity_to_csv_inner(sqltext, csvfilenames, sqalconn, verbose=False):
with open(csvname, 'w', newline='\n', encoding='utf-8') as csvfile: with open(csvname, 'w', newline='\n', encoding='utf-8') as csvfile:
line_count = 0 line_count = 0
mycsvwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) mycsvwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
#TODO - write column names #write column names
mycsvwriter.writerow(results.keys()) mycsvwriter.writerow(results.keys())
line_count += 1 line_count += 1
for row in results: for row in results:
......
...@@ -50,6 +50,21 @@ class TestStuff(unittest.TestCase): ...@@ -50,6 +50,21 @@ class TestStuff(unittest.TestCase):
ctc.clarity_to_csv(sqlfilename1, genericcsvs, dbconn=sqalconn) ctc.clarity_to_csv(sqlfilename1, genericcsvs, dbconn=sqalconn)
ctc.clarity_to_csv(sqlfilename2, genericcsvs, dbconn=sqalconn) ctc.clarity_to_csv(sqlfilename2, genericcsvs, dbconn=sqalconn)
def test_save_to_dataframes(self):
sqlfilename = testquerydir + "testCohort.sql"
with ctc.get_clarity_engine().connect() as sqalconn:
(df1, df2) = ctc.sqlfile_to_dfs(sqlfilename, sqalconn)
self.assertEqual(len(df1),3)
self.assertEqual(len(df2),2)
    def test_save_to_df_2col(self):
        # Two statements in one text block: TOP 10 of two columns, then TOP 5
        # of one column. sqltext_to_dfs should yield one DataFrame per SELECT.
        sql_2col = '''SELECT TOP 10 PAT_ID, PAT_ENC_CSN_ID FROM PAT_ENC;
        SELECT TOP 5 CONTACT_DATE FROM PAT_ENC tablesample(0.01);
        '''
        with ctc.get_clarity_engine().connect() as sqalconn:
            (df1, df2) = ctc.sqltext_to_dfs(sql_2col, sqalconn)
            self.assertEqual(len(df1),10)
            self.assertEqual(len(df2),5)
def test_comment_with_semicolon(self): def test_comment_with_semicolon(self):
...@@ -78,8 +93,6 @@ class TestStuff(unittest.TestCase): ...@@ -78,8 +93,6 @@ class TestStuff(unittest.TestCase):
ctc.clarity_to_csv(sqlfilename, genericcsvs, dbconn=sqalconn) ctc.clarity_to_csv(sqlfilename, genericcsvs, dbconn=sqalconn)
def test_unicode_error(self): def test_unicode_error(self):
genericcsvs = [ genericcsvs = [
testdatadir + 'test_cohort.csv' testdatadir + 'test_cohort.csv'
...@@ -147,8 +160,10 @@ class TestStuff(unittest.TestCase): ...@@ -147,8 +160,10 @@ class TestStuff(unittest.TestCase):
if __name__ == '__main__': if __name__ == '__main__':
t = TestStuff() t = TestStuff()
t.test_save_to_dataframes()
t.test_save_to_df_2col()
# t.test_remove_file_not_there() # t.test_remove_file_not_there()
t.test_integration_test() # t.test_integration_test()
# t.test_comment_with_semicolon() # t.test_comment_with_semicolon()
# t.test_none_csv() # t.test_none_csv()
# t.test_unicode_error() # t.test_unicode_error()
...@@ -158,7 +173,7 @@ if __name__ == '__main__': ...@@ -158,7 +173,7 @@ if __name__ == '__main__':
# t.test_cohort() # t.test_cohort()
#unittest.main() # unittest.main()
......
-- List column metadata for the CLARITY_MEDICATION table.
SELECT * FROM INFORMATION_SCHEMA.COLUMNS WHERE table_name = 'CLARITY_MEDICATION'
\ No newline at end of file
# Exploratory scratch script for learning the SQLAlchemy API against both a
# local SQLite database and the Clarity SQL Server instance.  The #%% markers
# delimit interactive cells (VSCode/Spyder style) meant to be run in order.
import sqlalchemy
import clarity_to_csv as ctc
#from sqlalchemy import create_engine
sqlalchemy.__version__  # evaluated interactively to confirm installed version
#%% Connect to a throwaway local SQLite database
sqlite_db_filepath = "C:\\Users\\LynchSe\\Documents\\Data\\database.db"
e = sqlalchemy.create_engine('sqlite:///{}'.format(sqlite_db_filepath))
conn = e.connect()
#%% Connect to Clarity via the project helper
eng_clarity = ctc.get_clarity_engine()
conn_clarity = eng_clarity.connect()
#%%
metadata_obj = sqlalchemy.MetaData()
#%% Declare a toy table against the metadata (never created here)
mytable = sqlalchemy.schema.Table("mytable", metadata_obj,
sqlalchemy.Column('mytable_id', sqlalchemy.Integer, primary_key=True),
sqlalchemy.Column('value', sqlalchemy.String(50))
)
mytable.name  # interactive check of the declared table name
#%% Reflection works against Clarity: load PAT_ENC's schema from the live DB
pat_enc = sqlalchemy.Table('PAT_ENC', metadata_obj, autoload_with=eng_clarity)
list(pat_enc.columns)
#%% Can I easily import a cohort into clarity?
# "##cohort" looks like an MSSQL global temp table — TODO confirm its lifetime
cohort = sqlalchemy.schema.Table("##cohort", metadata_obj,
sqlalchemy.Column('PAT_ID', sqlalchemy.String(18), primary_key=True))
#%% Create the temp table on the Clarity server
cohort.create(eng_clarity)
#%% Insert a single row
stmt = sqlalchemy.insert(cohort).values(PAT_ID='5931')
eng_clarity.execute(stmt)
#%% Insert multiple rows from a list of dicts
rows = [{'PAT_ID':'1132'}, {'PAT_ID':'1133'}]
stmt = sqlalchemy.insert(cohort).values(rows)
eng_clarity.execute(stmt)
#TODO - test if this can handle a gazillion rows without being super slow
#%% Select everything back out
stmt = sqlalchemy.select(cohort)
res = eng_clarity.execute(stmt)
#%% Read query results straight into pandas, with and without filters
import pandas as pd
df = pd.read_sql(stmt, eng_clarity)
df2 = pd.read_sql(stmt.where(cohort.c.PAT_ID < 2000), eng_clarity)
df3 = pd.read_sql(stmt.where(cohort.c.PAT_ID.in_([5931, 1132])), eng_clarity)
#%% now how do I select from PAT_ENC based on a list of ids
#TODO - import into temp table and use join
#TODO - compare with IN clause?
Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!