I just did a 10 GB file. I wrote some simple Python to read the file and parse each row, and inserted 114,000,000 rows into SQL Server Express until it blew past its 10 GB database file limit. That was fun. Now I'm doing it with MySQL, which has no file size limit. Here is some sample code — it isn't the exact code I ran, but it's pretty close. I have spent some time getting it to run; it took 24 hours to insert 114,000,000 rows into the SQL Server Express database on my shitty laptop.
def get_sqlserver_connection_string():
    """Build the SQL Server ODBC connection string for the Windows 10 DEV box.

    Uses Windows Authentication (TRUSTED_CONNECTION=YES), so no credentials
    are embedded in the string.

    Returns:
        str: a connection string ready to pass to ``pyodbc.connect()``.

    TODO: load driver/server/database from a config file instead of
    hard-coding them (original author's note, dated 30 August 2022).
    """
    driver = '{ODBC Driver 17 for SQL Server}'
    server = r'LAPTOP1\SQLEXPRESS'  # host\named-instance
    database = 'DB'
    # NOTE: removed the stray debug print of the server name that the
    # original left in; callers only need the returned string.
    return (
        f'DRIVER={driver};SERVER={server};DATABASE={database}'
        ';TRUSTED_CONNECTION=YES'
    )
# end get_sqlserver_connection_string
def insert_my_record(str_insert_statement):
    """Connect to SQL Server and execute one pre-formatted SQL statement.

    Opens a fresh connection per call (slow for bulk loads — reuse a single
    connection/cursor at the caller if throughput matters), executes the
    statement, commits on success and rolls back on failure. Exits the
    process if the connection itself cannot be established.

    Args:
        str_insert_statement: a complete, already-escaped SQL statement.

    NOTE(security): the statement arrives as a fully-built string, so the
    caller is responsible for escaping; prefer parameterized queries
    (``cursor.execute(sql, params)``) for untrusted data.

    Setup references:
    https://docs.microsoft.com/en-us/sql/connect/python/pyodbc/step-1-configure-development-environment-for-pyodbc-python-development?view=sql-server-ver16
    https://docs.microsoft.com/en-us/sql/connect/python/pyodbc/step-3-proof-of-concept-connecting-to-sql-using-pyodbc?view=sql-server-ver16
    """
    import sys
    import pyodbc as odbc

    conn_string = get_sqlserver_connection_string()
    try:
        conn = odbc.connect(conn_string)
    except Exception as e:
        # Fatal: no connection was opened, so there is nothing to clean up.
        # (The original ran `conn.close()` in a finally block even on this
        # path, raising NameError on the unbound `conn` and masking the exit.)
        print(e)
        print('error: task terminated')
        sys.exit()
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(str_insert_statement)
        except Exception as e:
            # pyodbc cursors expose commit/rollback as conveniences that act
            # on the underlying connection.
            cursor.rollback()
            print(e)
            print('transaction rolled back')
        else:
            print('insert statement executed')
            cursor.commit()
        finally:
            # Close the cursor on every path (the original leaked it on the
            # rollback path).
            cursor.close()
    finally:
        conn.close()
# end insert_record()
#Run it
import time
start_time = time.time()
#Open .sql file for read
filename = "1.1.sql"
isayso = True
current_row = 0
rows_per_insert = 10
rows_to_insert = 0
test_mode = True;
with open(filename, "r") as myfile:
while isayso:
#Let it insert strings with single quotes in them
while rows_to_insert < rows_per_insert:
current_row += 1
rows_to_insert += 1
strNewRow = "SET QUOTED_IDENTIFIER OFF"
strNewRow = strNewRow + "
"
strNewRow = strNewRow + myfile.readline().strip()
#Replace string
strNewRow = strNewRow.replace("idemail VALUES", "idemail (member_id, member_primary_email) VALUES")
#See sample email has an apostrophe in it. Lets handle that
#INSERT INTO idemail (member_id, member_primary_email) VALUES ('15628876', 'kimberly_o'
[email protected]');
strNewRow = strNewRow.replace("('", "(\"")
strNewRow = strNewRow.replace("',", "\",")
strNewRow = strNewRow.replace(" '", " \"")
strNewRow = strNewRow.replace(" '", " \"")
strNewRow = strNewRow.replace("')", "\")")
strNewRow = strNewRow + "
"
print(strNewRow)
#print(strNewRow)
#print('Row Count = :' + str(current_row))
if rows_to_insert == rows_per_insert:
print(strNewRow)
print('Row Count = :' + str(current_row))
# do the Insert
insert_my_record(strNewRow)
#if counter >= 1000000:
# print('Counter = ' + str(counter))
if test_mode:
#just do one lot
isayso = False
isayso = False
print("--- %s minutes ---" % ((time.time() - start_time)/60))