clarity_to_csv.py
4.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
from sqlalchemy import create_engine
from getpass import getpass
import pymssql
import csv
import pandas as pd
#%% Andrew functions
def get_mssql_engine(
username="lynchse",
host="clarityprod.uphs.upenn.edu",
database="clarity_snapshot_db",
domain="UPHS",
port="1433",
timeout=600, #2hr?
password=None,
):
from sqlalchemy import create_engine
if password is None:
password = getpass("PW: ")
user = domain + "\\" + username
return create_engine(
f"mssql+pymssql://{user}:{password}@{host}:{port}/{database}?timeout={timeout}"
)
#%% My functions
selahcredsfilename = 'C:\\Users\\LynchSe\\Documents\\selah_clarity_credentials.txt'
def get_clarity_engine(credsfilename = selahcredsfilename, timeout=600, host='clarityprod.uphs.upenn.edu'):
with open(credsfilename, 'r') as credsfile:
nameline = credsfile.readline().strip()
pwline = credsfile.readline().strip()
clarity_engine = get_mssql_engine(username=nameline, password=pwline, timeout=timeout, host=host)
return clarity_engine
def sqltext_to_dfs(sqltext, dbconn):
sql_stmts = extract_sql_statements(sqltext)
dfs = []
for stmt in sql_stmts:
res = dbconn.execute(stmt)
if res.returns_rows == True:
columns = list(res.keys())
values = []
for row in res: #note - there is some repeated logic here
values.append(list(row))
df = pd.DataFrame(data=values, columns=columns)
dfs.append(df)
return tuple(dfs)
def sqlfile_to_dfs(sqlfilename, dbconn):
with open(sqlfilename, 'r') as sqlfile:
sqltext = sqlfile.read()
dfs = sqltext_to_dfs(sqltext, dbconn)
return dfs
def clarity_to_csv(sqlfilename, csvfilenames, dbconn=None):
print("Running SQL from {}".format(sqlfilename))
import time
start = time.time()
with open(sqlfilename, 'r') as sqlfile:
sqltext = sqlfile.read()
eng = get_clarity_engine()
if dbconn:
clarity_to_csv_inner(sqltext, csvfilenames, dbconn)
else:
with eng.connect() as sqalconn:
clarity_to_csv_inner(sqltext, csvfilenames, sqalconn)
end = time.time()
duration = end - start
dtstr = time.strftime('%a %I:%M%p %Y-%m-%d')
print("Query ran and exported {} in {:.1f} s".format(dtstr, duration))
def extract_sql_statements(sqltext):
import sqlparse
sqltext_cleaned = sqlparse.format(sqltext, strip_comments=True).strip()
sqlstatements = sqltext_cleaned.split(';')
sqlstatements = [stmt.strip() for stmt in sqlstatements]
if sqlstatements[-1].strip() == '':
sqlstatements.pop() # often there is a final semicolon leading to a empty last statement
return sqlstatements
def clarity_to_csv_inner(sqltext, csvfilenames, sqalconn, verbose=False):
sqlstatements = extract_sql_statements(sqltext)
which_statement = 0
which_csvfile = 0
for sqlstatement in sqlstatements:
if verbose == True:
print("\n\nProcessing statement: '{}'\n".format(sqlstatement.strip()))
#TODO - time the query execution time
results = sqalconn.execute(sqlstatement) #this is hanging?
if results.returns_rows == True: #ONLY DO THIS IFFFF there are >0 results rows
csvname = csvfilenames[which_csvfile]
which_csvfile += 1
if csvname:
print("Writing CSV file {}".format(csvname))
#TODO - time the file writing time
with open(csvname, 'w', newline='\n', encoding='utf-8') as csvfile:
line_count = 0
mycsvwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
#write column names
mycsvwriter.writerow(results.keys())
line_count += 1
for row in results:
# print("Line {}".format(line_count))
mycsvwriter.writerow(row)
line_count += 1
print("Done writing CSV file!")
which_statement += 1