clarity_to_csv.py
4.57 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
from sqlalchemy import create_engine
from getpass import getpass
import pymssql
import csv
import pandas as pd
#%% Andrew functions
def get_mssql_engine(
username="lynchse",
host="clarityprod.uphs.upenn.edu",
database="clarity_snapshot_db",
domain="UPHS",
port="1433",
timeout=600, #2hr?
password=None,
):
from sqlalchemy import create_engine
if password is None:
password = getpass("PW: ")
user = domain + "\\" + username
return create_engine(
f"mssql+pymssql://{user}:{password}@{host}:{port}/{database}?timeout={timeout}"
)
#%% My functions
selahcredsfilename = 'C:\\Users\\LynchSe\\Documents\\selah_clarity_credentials.txt'
def get_clarity_engine(credsfilename = selahcredsfilename, **kwargs):
with open(credsfilename, 'r') as credsfile:
nameline = credsfile.readline().strip()
pwline = credsfile.readline().strip()
kwargs #this is a dict, how do I pass it to the following function
clarity_engine = get_mssql_engine(username=nameline, password=pwline, **kwargs)
return clarity_engine
def sqltext_to_dfs(sqltext, dbconn):
sql_stmts = extract_sql_statements(sqltext)
dfs = []
for stmt in sql_stmts:
res = dbconn.execute(stmt)
if res.returns_rows == True:
columns = list(res.keys())
values = []
for row in res: #note - there is some repeated logic here
values.append(list(row))
df = pd.DataFrame(data=values, columns=columns)
dfs.append(df)
return tuple(dfs)
def sqlfile_to_dfs(sqlfilename, dbconn):
print("Running SQL from {}".format(sqlfilename))
import time
start = time.time()
with open(sqlfilename, 'r') as sqlfile:
sqltext = sqlfile.read()
dfs = sqltext_to_dfs(sqltext, dbconn)
end = time.time()
duration = end - start
dtstr = time.strftime('%a %I:%M%p %Y-%m-%d')
print("Query ran and exported {} in {:.1f} s".format(dtstr, duration))
return dfs
def clarity_to_csv(sqlfilename, csvfilenames, dbconn=None):
print("Running SQL from {}".format(sqlfilename))
import time
start = time.time()
with open(sqlfilename, 'r') as sqlfile:
sqltext = sqlfile.read()
eng = get_clarity_engine()
if dbconn:
clarity_to_csv_inner(sqltext, csvfilenames, dbconn)
else:
with eng.connect() as sqalconn:
clarity_to_csv_inner(sqltext, csvfilenames, sqalconn)
end = time.time()
duration = end - start
dtstr = time.strftime('%a %I:%M%p %Y-%m-%d')
print("Query ran and exported {} in {:.1f} s".format(dtstr, duration))
def extract_sql_statements(sqltext):
import sqlparse
sqltext_cleaned = sqlparse.format(sqltext, strip_comments=True).strip()
sqlstatements = sqltext_cleaned.split(';')
sqlstatements = [stmt.strip() for stmt in sqlstatements]
if sqlstatements[-1].strip() == '':
sqlstatements.pop() # often there is a final semicolon leading to a empty last statement
return sqlstatements
def clarity_to_csv_inner(sqltext, csvfilenames, sqalconn, verbose=False):
sqlstatements = extract_sql_statements(sqltext)
which_statement = 0
which_csvfile = 0
for sqlstatement in sqlstatements:
if verbose == True:
print("\n\nProcessing statement: '{}'\n".format(sqlstatement.strip()))
#TODO - time the query execution time
results = sqalconn.execute(sqlstatement) #this is hanging?
if results.returns_rows == True: #ONLY DO THIS IFFFF there are >0 results rows
csvname = csvfilenames[which_csvfile]
which_csvfile += 1
if csvname:
print("Writing CSV file {}".format(csvname))
#TODO - time the file writing time
with open(csvname, 'w', newline='\n', encoding='utf-8') as csvfile:
line_count = 0
mycsvwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
#write column names
mycsvwriter.writerow(results.keys())
line_count += 1
for row in results:
# print("Line {}".format(line_count))
mycsvwriter.writerow(row)
line_count += 1
print("Done writing CSV file!")
which_statement += 1