#!/usr/bin/env perl
#---------------------------------------------------------------------
# file information
#---------------------------------------------------------------------
# File:        mssqlcopyunique.pl
# Type:        Strawberry Perl script for MS-Windows
# Purpose:     Fast-copy SQL table rows, omitting duplicates
# License:     Creative Commons BY-NC-SA 4.0
#              https://creativecommons.org/licenses/by-nc-sa/4.0/
# Attribution: Robert Kiraly (OldCoder)
# Revision:    160601
#---------------------------------------------------------------------
# important note
#---------------------------------------------------------------------
# This software is provided on an AS IS basis with ABSOLUTELY NO WAR-
# RANTY.  The entire risk as to the quality and performance  of  the
# software is with you.  Should the software prove defective, you as-
# sume the cost of all necessary servicing, repair or correction.  In
# no event will any of the developers,  or any other party, be liable
# to anyone for damages arising out of use of the software, or inabil-
# ity to use the software.
#---------------------------------------------------------------------
# documentation
#---------------------------------------------------------------------
# 1. Overview:
#
# This script copies rows from one SQL Server database table to ano-
# ther, in the same database or a different one,  omitting duplicate
# rows.
#
# In cases where there is a lot of duplication,  this script may  be
# faster than de-dupe approaches implemented using only  native  SQL
# queries.
#
# This script can also be used to simplify tests of potential unique
# indexes in cases where sample datasets contain duplicate rows.
#---------------------------------------------------------------------
# 2. Assumptions:
#
# There are three assumptions that may not be correct for particular
# scenarios.  If these assumptions are incorrect, changes to the code
# will be needed:
#
# (a) SQL Server is set up for Windows Authentication.
# (b) The source table has no row-number column that needs to be dis-
#     regarded.
# (c) The data is text or numbers as opposed to binary blobs.
#---------------------------------------------------------------------
# 3. Requirements:
#
# Strawberry Perl with DBI and ODBC support.
# SQL Server; releases 2012 and 2014 should work.
# Windows 7 or a reasonably compatible release of Windows.
# RAM sufficient to hold a de-duped copy of the source table.
#---------------------------------------------------------------------
# 4. Usage:
#
# a. Edit the program parameters section of this file.  Set the para-
#    meters in the section appropriately.
#
# b. Use Object Explorer and/or MS-SQL queries to create  the  target
#    table.  It should have the same structure as the  source  table,
#    but should be empty, initially.
#
# c. If the assumptions listed in part 2 above are incorrect,  modify
#    the code as necessary.
#
# d. Run this file using Strawberry Perl.
#---------------------------------------------------------------------
# module setup
#---------------------------------------------------------------------

use strict;
use warnings;
use Carp;
use DBI;

# Trap warnings: any Perl warning is promoted to a fatal error.
$SIG{__WARN__} = sub { die @_; };

#---------------------------------------------------------------------
# standard constants
#---------------------------------------------------------------------

use constant ZERO  => 0;
use constant ONE   => 1;
use constant FALSE => 0;
use constant TRUE  => 1;

#---------------------------------------------------------------------
# program parameters
#---------------------------------------------------------------------

my $SERVER_SOURCE   = 'SQLSERVER01';
my $SERVER_DEST     = 'SQLSERVER02';
my $DATABASE_SOURCE = 'SaladSales';
my $DATABASE_DEST   = 'Vegetables';
my $TABLE_SOURCE    = 'ProduceAisle';
my $TABLE_DEST      = 'SaladBar';
my $DBI_SETUP       = 'DBI:ODBC:Driver={SQL Server}';

my $LIST_COUNT  = 500;          # Update progress report every this-
                                # many rows
my $FETCH_COUNT = 10000;        # Number of rows to fetch at a time

#---------------------------------------------------------------------
# To use this script in its default mode, set $ERROR_ABORT to FALSE.
#
# This script copies table rows, and attempts to skip duplicate rows,
# regardless of the setting.
#
# If $ERROR_ABORT is FALSE,  this script also ignores errors that re-
# sult from INSERT statements.  Such errors are assumed, correctly or
# incorrectly,  to be cases where rows are already in the target tab-
# le and are triggering conflicts with unique indexes.
#
# To use this script in an alternate mode,  set $ERROR_ABORT to TRUE.
# In this mode,  if an INSERT statement results in  an  error,  this
# script prints an error message and aborts.
#
# The second mode may be used to test potential unique indexes, which
# must be added to the target table in advance,  using  data  in  the
# source table as sample input.  The primary feature that this script
# offers in this context is that completely-redundant rows are  omit-
# ted from the test.

my $ERROR_ABORT = FALSE;

#---------------------------------------------------------------------
# helper routine
#---------------------------------------------------------------------
# column_count ($dbh, $catalog, $table)
#
# Returns the number of columns in table "dbo.$table" of database
# "$catalog", as reported by INFORMATION_SCHEMA.COLUMNS.  Dies on any
# database error, on an implausible count, or if the table appears to
# be missing (zero columns).

sub column_count
{
    my ($dbh, $catalog, $table) = @_;

    # Bind placeholders are used here (rather than interpolating the
    # names into the query text) to sidestep quoting problems.

    my $query = <<'END';
SELECT COUNT(COLUMN_NAME)
FROM INFORMATION_SCHEMA.COLUMNS
WHERE TABLE_CATALOG = ?
AND TABLE_SCHEMA    = 'dbo'
AND TABLE_NAME      = ?
END

    my $sth = $dbh->prepare ($query)  or die $DBI::errstr . "\n";
    $sth->execute ($catalog, $table)  or die $DBI::errstr . "\n";

    my ($numcols) = $sth->fetchrow_array();
    $sth->finish();

    die "Bad column count for $catalog.dbo.$table\n"
        unless defined ($numcols) and ($numcols =~ m@\A\d{1,4}\z@);

    # A count of zero means the table doesn't exist (or has no col-
    # umns); proceeding would build a broken "VALUES()" INSERT.

    die "Table $catalog.dbo.$table not found\n" if $numcols == ZERO;

    return $numcols;
}

#---------------------------------------------------------------------
# initial setup
#---------------------------------------------------------------------

select STDERR; $| = ONE;        # Set flush-on-write for two streams
select STDOUT; $| = ONE;        # Note: Order is significant here

#---------------------------------------------------------------------
# connect to databases
#---------------------------------------------------------------------
# Windows Authentication is assumed; no user name or password is pas-
# sed (see part 2 of the documentation).

my $dbh_source = DBI->connect
    ("$DBI_SETUP;Server=$SERVER_SOURCE;Database=$DATABASE_SOURCE")
        or die $DBI::errstr . "\n";

my $dbh_dest   = DBI->connect
    ("$DBI_SETUP;Server=$SERVER_DEST;Database=$DATABASE_DEST")
        or die $DBI::errstr . "\n";

#---------------------------------------------------------------------
# determine and compare column counts
#---------------------------------------------------------------------

my $numcols_source =
    column_count ($dbh_source, $DATABASE_SOURCE, $TABLE_SOURCE);

print "Number of columns is $numcols_source\n";

my $numcols_dest =
    column_count ($dbh_dest, $DATABASE_DEST, $TABLE_DEST);

# The two tables must have identical structures (same column count at
# a minimum) for a positional INSERT to make sense.

die "Source and target tables have different numbers of columns\n"
    unless $numcols_source == $numcols_dest;

#---------------------------------------------------------------------
# access source rows
#---------------------------------------------------------------------

my $query_source = <<"END";
USE [$DATABASE_SOURCE]
SELECT * FROM [dbo].[$TABLE_SOURCE]
END

my $sth_source = $dbh_source->prepare ($query_source)
    or die $DBI::errstr . "\n";
$sth_source->execute() or die $DBI::errstr . "\n";

#---------------------------------------------------------------------
# set up row-copy query
#---------------------------------------------------------------------
# Build a positional INSERT with one "?" placeholder per column.

my $placeholders = join ',', ('?') x $numcols_source;

my $query_dest = <<"END";
USE [$DATABASE_DEST]
INSERT INTO [dbo].[$TABLE_DEST] VALUES($placeholders)
END

my $sth_dest = $dbh_dest->prepare ($query_dest)
    or die $DBI::errstr . "\n";

# Suppress driver-printed INSERT errors;  otherwise the __WARN__ trap
# above would turn an expected duplicate-key warning into a fatal er-
# ror even when $ERROR_ABORT is FALSE.

$sth_dest->{PrintError} = ZERO;

#---------------------------------------------------------------------
# main loop
#---------------------------------------------------------------------

my $NumRows = ZERO;             # Number of rows processed
my %Saw;                        # Keys are unique rows as strings

while (my $data = $sth_source->fetchall_arrayref (undef, $FETCH_COUNT))
{
    # With a $max_rows argument, DBI may signal end-of-data with  an
    # empty (but true) array ref; without this check, a table  whose
    # row count is an exact multiple of $FETCH_COUNT could trigger a
    # needless extra fetch on an inactive handle.

    last unless @$data;

    printf "Block of %d row(s) fetched\n", scalar @$data;

    # Each batch is inserted inside a single transaction for speed.

    $dbh_dest->begin_work();

    for my $row (@$data)
    {
        # If you'd like to filter out rows in addition  to  duplicate
        # rows, you can do so here.  For example,  if you'd  like  to
        # skip rows for which the first column is undefined or blank,
        # this code should work:
        #
        #     my ($col1) = @$row;
        #     next unless defined ($col1) && ($col1 =~ m@\S@);

        # The following code skips rows which are  exact  duplicates
        # of any row processed previously.
        #
        # This version of the code assumes that there is no row-num-
        # ber column that needs to be disregarded.  If there is such
        # a column, this code should be modified so as to omit it
        # from the hash-table key that is created here.
        #
        # This version of the code also assumes that  data  is  text
        # or numbers as opposed to binary blobs.  NULs are  used  as
        # field separators in the key, and SQL NULLs are folded into
        # empty strings.

        my $RowKey = join "\000",
            map { defined ($_) ? $_ : "" } @$row;

        next if exists $Saw{$RowKey};
        $Saw{$RowKey} = ONE;

        if (!$sth_dest->execute (@$row))
        {
            # In the default mode, INSERT failures are presumed to be
            # unique-index conflicts and are ignored.

            die $DBI::errstr . "\n" if $ERROR_ABORT;
        }

        ++$NumRows;
        printf STDOUT ('%10d' . "\n", $NumRows)
            if ($NumRows % $LIST_COUNT) == ZERO;
    }

    $dbh_dest->commit();

    # A short batch means the source is exhausted;  stop rather than
    # making one more (empty) fetch call.

    last if @$data < $FETCH_COUNT;
}

#---------------------------------------------------------------------
# wrap it up
#---------------------------------------------------------------------

$sth_source->finish();
$sth_dest->finish();
$dbh_source->disconnect();
$dbh_dest->disconnect();

exit ZERO;