/*------------------------------------------------------------------------------------------------------------ how_to_find_and_remove_duplicates_SQL_Server_0040.sql Removing duplicates in your database is a common DBA tasks. Normalization is supposed to keep your data clean and eliminate data redundancy. However truth of the matter is that many corporate database systems carry duplicates in one form or another. In this script we are going to first create duplicates in the AdventureWorks sample database. Next we are going to show you how to identify the duplicates by using two different methods. The two ways you can identify duplicates include: -GROUP BY clause -DISTINCT clause Finally we are going to show you one way to eliminate duplicates in the main transactional table using another reference table. ------------------------------------------------------------------------------------------------------------*/ use adventureworks --make a copy of the production table first drop table Person.CountryRegion_Has_Duplicates select * into Person.CountryRegion_Has_Duplicates from Person.CountryRegion --Create duplicates manually insert into Person.CountryRegion_Has_Duplicates select * from Person.CountryRegion where Name in ('Pakistan', 'India', 'Bangladesh') --Count all records in table that had duplicate records select count(*) from Person.CountryRegion_Has_Duplicates --241 --Count distinct (non-duplicate) records in a row --One way (preferred) is to use the Group BY clause select [CountryRegionCode] ,[Name] ,[ModifiedDate] from Person.CountryRegion_Has_Duplicates group by [CountryRegionCode] ,[Name] ,[ModifiedDate] --238 --Second way is to use the DISTINCT clause select distinct * from Person.CountryRegion_Has_Duplicates --238 --Find out which rows have duplicate values in them select [CountryRegionCode] ,[Name] ,[ModifiedDate] ,count(*) as NumRows from Person.CountryRegion_Has_Duplicates group by [CountryRegionCode] ,[Name] ,[ModifiedDate] having count(*) > 1 --Make a copy of duplicate rows in a new table, this will be used as a join later --in the SQL query to remove the duplicates drop table Person.CountryRegion_Duplicates select [CountryRegionCode] ,[Name] ,[ModifiedDate] into Person.CountryRegion_Duplicates from Person.CountryRegion_Has_Duplicates group by [CountryRegionCode] ,[Name] ,[ModifiedDate] having count(*) > 1 --3 rows select * from Person.CountryRegion_Duplicates --3 rows --Here we delete one row at a time using the ROWCOUNT command, this could possibly go in a loop --Note here we had to run it three times to remove the three duplicate entries set rowcount 1 delete b from Person.CountryRegion_Duplicates a join Person.CountryRegion_Has_Duplicates b on a.CountryRegionCode=a.CountryRegionCode and a.Name=b.Name set rowcount 0 --check the final table for duplicate rows select * from Person.CountryRegion_Has_Duplicates --238 rows, not 241! select [CountryRegionCode] ,[Name] ,[ModifiedDate] ,count(*) as NumRows from Person.CountryRegion_Has_Duplicates group by [CountryRegionCode] ,[Name] ,[ModifiedDate] having count(*) > 1 --0 rows no duplicates /* Compiled by http://www.learningsqlserver2008.com/, Kash Data Consulting LLC Disclaimer: All sample code and scripts are compiled by Kash Data Consulting LLC for illustrative purposes only. Kash Data Consulting LLC, therefore, cannot guarantee or imply reliability, or function of this code or scripts. All code contained herein are provided to you "AS IS" without any warranties of any kind. Please test all code and scripts in test environment before deployment in production systems. */