#!/usr/bin/env perl
#---------------------------------------------------------------------
# file information
#---------------------------------------------------------------------
# File:        mssqlcopyunique.pl
# Type:        Strawberry Perl script for MS-Windows
# Purpose:     Fast-copy SQL table rows, omitting duplicates
# License:     Creative Commons BY-NC-SA 4.0
#              https://creativecommons.org/licenses/by-nc-sa/4.0/
# Attribution: Robert Kiraly (OldCoder)
# Revision:    160601
#---------------------------------------------------------------------
# important note
#---------------------------------------------------------------------
# This software is provided on an AS IS basis with ABSOLUTELY NO WAR-
# RANTY.  The entire risk as to the quality and performance  of  the
# software is with you.  Should the software prove defective, you as-
# sume the cost of all necessary servicing, repair or correction.  In
# no event will any of the developers,  or any other party, be liable
# to anyone for damages arising out of use of the software, or inabil-
# ity to use the software.
#---------------------------------------------------------------------
# documentation
#---------------------------------------------------------------------
# 1. Overview:
#
# This script copies rows from one SQL Server database table to ano-
# ther, in the same database or a different one,  omitting duplicate
# rows.
#
# In cases where there is a lot of duplication,  this script may  be
# faster than de-dupe approaches implemented using only  native  SQL
# queries.
#
# This script can also be used to simplify tests of potential unique
# indexes in cases where sample datasets contain duplicate rows.
#---------------------------------------------------------------------
# 2. Assumptions:
#
# There are three assumptions that may not be correct for particular
# scenarios.  If these assumptions are incorrect, changes to the code
# will be needed:
#
# (a) SQL Server is set up for Windows Authentication.
# (b) The source table has no row-number column that needs to be dis-
#     regarded.
# (c) The data is text or numbers as opposed to binary blobs.
#---------------------------------------------------------------------
# 3. Requirements:
#
# Strawberry Perl with DBI and ODBC support.
# SQL Server; releases 2012 and 2014 should work.
# Windows 7 or a reasonably compatible release of Windows.
# RAM sufficient to hold a de-duped copy of the source table.
#---------------------------------------------------------------------
# 4. Usage:
#
# a. Edit the program parameters section of this file.  Set the para-
#    meters in the section appropriately.
#
# b. Use Object Explorer and/or MS-SQL queries to create  the  target
#    table.  It should have the same structure as the  source  table,
#    but should be empty, initially.
#
# c. If the assumptions listed in part 2 above are incorrect,  modify
#    the code as necessary.
#
# d. Run this file using Strawberry Perl.
#---------------------------------------------------------------------
# module setup
#---------------------------------------------------------------------

use strict;
use warnings;
use Carp;
use DBI;

# Trap warnings: any Perl warning is promoted to a fatal error.
$SIG{__WARN__} = sub { die @_; };

#---------------------------------------------------------------------
# standard constants
#---------------------------------------------------------------------

use constant ZERO  => 0;
use constant ONE   => 1;
use constant FALSE => 0;
use constant TRUE  => 1;

#---------------------------------------------------------------------
# program parameters
#---------------------------------------------------------------------

my $SERVER_SOURCE   = 'SQLSERVER01';
my $SERVER_DEST     = 'SQLSERVER02';
my $DATABASE_SOURCE = 'SaladSales';
my $DATABASE_DEST   = 'Vegetables';
my $TABLE_SOURCE    = 'ProduceAisle';
my $TABLE_DEST      = 'SaladBar';
my $DBI_SETUP       = 'DBI:ODBC:Driver={SQL Server}';

my $LIST_COUNT  = 500;          # Update progress report every this-
                                # many rows
my $FETCH_COUNT = 10000;        # Number of rows to fetch at a time

#---------------------------------------------------------------------
# To use this script in its default mode, set $ERROR_ABORT to FALSE.
#
# This script copies table rows, and attempts to skip duplicate rows,
# regardless of the setting.
#
# If $ERROR_ABORT is FALSE,  this script also ignores errors that re-
# sult from INSERT statements.  Such errors are assumed, correctly or
# incorrectly,  to be cases where rows are already in the target tab-
# le and are triggering conflicts with unique indexes.
#
# To use this script in an alternate mode,  set $ERROR_ABORT to TRUE.
# In this mode,  if an INSERT statement results in  an  error,  this
# script prints an error message and aborts.
#
# The second mode may be used to test potential unique indexes, which
# must be added to the target table in advance,  using  data  in  the
# source table as sample input.  The primary feature that this script
# offers in this context is that completely-redundant rows are  omit-
# ted from the test.

my $ERROR_ABORT = FALSE;

#---------------------------------------------------------------------
# helper routine
#---------------------------------------------------------------------
# column_count ($dbh, $catalog, $table)
#
# Returns the number of columns in table "dbo.$table" of database
# "$catalog", as reported by INFORMATION_SCHEMA.COLUMNS.  Dies on any
# database error, on an implausible count, or if the table appears to
# be missing (zero columns).

sub column_count
{
    my ($dbh, $catalog, $table) = @_;

    # Bind placeholders are used here (rather than interpolating the
    # names into the query text) to sidestep quoting problems.

    my $query = <<'END';
SELECT COUNT(COLUMN_NAME)
FROM INFORMATION_SCHEMA.COLUMNS
WHERE TABLE_CATALOG = ?
AND TABLE_SCHEMA    = 'dbo'
AND TABLE_NAME      = ?
END

    my $sth = $dbh->prepare ($query)  or die $DBI::errstr . "\n";
    $sth->execute ($catalog, $table)  or die $DBI::errstr . "\n";

    my ($numcols) = $sth->fetchrow_array();
    $sth->finish();

    die "Bad column count for $catalog.dbo.$table\n"
        unless defined ($numcols) and ($numcols =~ m@\A\d{1,4}\z@);

    # A count of zero means the table doesn't exist (or has no col-
    # umns); proceeding would build a broken "VALUES()" INSERT.

    die "Table $catalog.dbo.$table not found\n" if $numcols == ZERO;

    return $numcols;
}

#---------------------------------------------------------------------
# initial setup
#---------------------------------------------------------------------

select STDERR; $| = ONE;        # Set flush-on-write for two streams
select STDOUT; $| = ONE;        # Note: Order is significant here

#---------------------------------------------------------------------
# connect to databases
#---------------------------------------------------------------------
# Windows Authentication is assumed; no user name or password is pas-
# sed (see part 2 of the documentation).

my $dbh_source = DBI->connect
    ("$DBI_SETUP;Server=$SERVER_SOURCE;Database=$DATABASE_SOURCE")
        or die $DBI::errstr . "\n";

my $dbh_dest   = DBI->connect
    ("$DBI_SETUP;Server=$SERVER_DEST;Database=$DATABASE_DEST")
        or die $DBI::errstr . "\n";

#---------------------------------------------------------------------
# determine and compare column counts
#---------------------------------------------------------------------

my $numcols_source =
    column_count ($dbh_source, $DATABASE_SOURCE, $TABLE_SOURCE);

print "Number of columns is $numcols_source\n";

my $numcols_dest =
    column_count ($dbh_dest, $DATABASE_DEST, $TABLE_DEST);

# The two tables must have identical structures (same column count at
# a minimum) for a positional INSERT to make sense.

die "Source and target tables have different numbers of columns\n"
    unless $numcols_source == $numcols_dest;

#---------------------------------------------------------------------
# access source rows
#---------------------------------------------------------------------

my $query_source = <<"END";
USE [$DATABASE_SOURCE]
SELECT * FROM [dbo].[$TABLE_SOURCE]
END

my $sth_source = $dbh_source->prepare ($query_source)
    or die $DBI::errstr . "\n";
$sth_source->execute() or die $DBI::errstr . "\n";

#---------------------------------------------------------------------
# set up row-copy query
#---------------------------------------------------------------------
# Build a positional INSERT with one "?" placeholder per column.

my $placeholders = join ',', ('?') x $numcols_source;

my $query_dest = <<"END";
USE [$DATABASE_DEST]
INSERT INTO [dbo].[$TABLE_DEST] VALUES($placeholders)
END

my $sth_dest = $dbh_dest->prepare ($query_dest)
    or die $DBI::errstr . "\n";

# Suppress driver-printed INSERT errors;  otherwise the __WARN__ trap
# above would turn an expected duplicate-key warning into a fatal er-
# ror even when $ERROR_ABORT is FALSE.

$sth_dest->{PrintError} = ZERO;

#---------------------------------------------------------------------
# main loop
#---------------------------------------------------------------------

my $NumRows = ZERO;             # Number of rows processed
my %Saw;                        # Keys are unique rows as strings

while (my $data = $sth_source->fetchall_arrayref (undef, $FETCH_COUNT))
{
    # With a $max_rows argument, DBI may signal end-of-data with  an
    # empty (but true) array ref; without this check, a table  whose
    # row count is an exact multiple of $FETCH_COUNT could trigger a
    # needless extra fetch on an inactive handle.

    last unless @$data;

    printf "Block of %d row(s) fetched\n", scalar @$data;

    # Each batch is inserted inside a single transaction for speed.

    $dbh_dest->begin_work();

    for my $row (@$data)
    {
        # If you'd like to filter out rows in addition  to  duplicate
        # rows, you can do so here.  For example,  if you'd  like  to
        # skip rows for which the first column is undefined or blank,
        # this code should work:
        #
        #     my ($col1) = @$row;
        #     next unless defined ($col1) && ($col1 =~ m@\S@);

        # The following code skips rows which are  exact  duplicates
        # of any row processed previously.
        #
        # This version of the code assumes that there is no row-num-
        # ber column that needs to be disregarded.  If there is such
        # a column, this code should be modified so as to omit it
        # from the hash-table key that is created here.
        #
        # This version of the code also assumes that  data  is  text
        # or numbers as opposed to binary blobs.  NULs are  used  as
        # field separators in the key, and SQL NULLs are folded into
        # empty strings.

        my $RowKey = join "\000",
            map { defined ($_) ? $_ : "" } @$row;

        next if exists $Saw{$RowKey};
        $Saw{$RowKey} = ONE;

        if (!$sth_dest->execute (@$row))
        {
            # In the default mode, INSERT failures are presumed to be
            # unique-index conflicts and are ignored.

            die $DBI::errstr . "\n" if $ERROR_ABORT;
        }

        ++$NumRows;
        printf STDOUT ('%10d' . "\n", $NumRows)
            if ($NumRows % $LIST_COUNT) == ZERO;
    }

    $dbh_dest->commit();

    # A short batch means the source is exhausted;  stop rather than
    # making one more (empty) fetch call.

    last if @$data < $FETCH_COUNT;
}

#---------------------------------------------------------------------
# wrap it up
#---------------------------------------------------------------------

$sth_source->finish();
$sth_dest->finish();
$dbh_source->disconnect();
$dbh_dest->disconnect();

exit ZERO;