Identifying specific differences between two datasets in Rstudio.

Photo by Jon Tyson on Unsplash
  1. Missing rows/columns
  2. Rows/columns which shouldn’t exist
  3. Incorrect values in any row
  4. Incorrect column headers
  5. Incorrect ordering of rows & columns
# Defining the variables
Migrated_Files_Dir<-"C:/Users/New_Docs/Dummy Files/Migrated_Files(BAD_FILES)"
Source_Files_Dir<-"C:/Users/New_Docs/Dummy Files/Source_Files(GOOD_FILES)"Data_Assessment_Save_Loc<-"C:/Users/New_Docs/Dummy Files"Detailed_Changes_Save_Loc<-"C:/Users/New_Docs/Dummy Files"Patched_Files_Save_Loc<-"C:/Users/New_Docs/Dummy Files"Scores_Save_Loc<-"C:/Users/New_Docs/Dummy Files"Mapping_Table_Loc<-"C:/Users/New_Docs/Dummy Files/Mapping_Folder/Mapping.csv"
The source file.
The source file.
The migrated table
The migrated file.
Mapping data.
#### START ####
library(daff)
library(htmltools)
library(readxl)
# Identify the migrated files & its corresponding source filesMigrated_Files <- list.files(path = Migrated_Files_Dir,recursive = TRUE)Source_Files <- list.files(path = Source_Files_Dir,recursive = TRUE)#Import the filessetwd(Migrated_Files_Dir)
Migrated_Files_Bundle<-lapply(Migrated_Files,function(i){
read.csv(i,header=TRUE,fileEncoding="UTF-8-BOM")
})
names(Migrated_Files_Bundle)<-Migrated_Files
setwd(Source_Files_Dir)
Source_Files_Bundle<-lapply(Source_Files,function(i){
read.csv(i,header=TRUE,fileEncoding="UTF-8-BOM")
})
names(Source_Files_Bundle)<-Source_Files

#Mapping Procedure
#Import the mapping tableMapping_Table<-read.csv(Mapping_Table_Loc,fileEncoding="UTF-8-BOM")####Conduct mapping procedure#####Convert mapping tableMapping_previous_names<-as.list(Mapping_Table$Previous_Name)
Mapping_new_names<-as.list(Mapping_Table$New_Name)
New_Migrated_Files_Bundle<-Migrated_Files_Bundle# Mapping the table names
k=1
for(i in names(Migrated_Files_Bundle)){
s<-1
for (j in Mapping_new_names){

if(i==j){

names(New_Migrated_Files_Bundle)[k]<-Mapping_previous_names[s]
break

}
s<-s+1

}
k<-k+1
}
#Import list objects into environments
my.env<-new.env()
list2env(New_Migrated_Files_Bundle,envir =my.env)#Remove unneeded objects
rm(Migrated_Files_Bundle)
rm(New_Migrated_Files_Bundle)
rm(Source_Files_Bundle)
rm(Mapping_Table)
rm(k)
rm(s)
rm(i)
rm(j)
#Mapping the column names
k<-1
m<-1
s<-1
for(i in names(my.env)){


for(i in colnames(my.env[[names(my.env)[k]]])){


for( j in Mapping_new_names){

if(colnames(my.env[[names(my.env)[k]]])[m]==Mapping_new_names[[s]]){

colnames(my.env[[names(my.env)[k]]])[m]<-Mapping_previous_names[[s]]
break

}
s<-s+1

}
s<-1
m<-m+1

}
m<-1
k<-k+1

}
# Mapping the remainder
k<-1
s<-1
n<-1
f<-1
for(i in names(my.env)){

for(a in colnames(my.env[[names(my.env)[k]]])){

for(b in my.env[[names(my.env)[k]]][,n]){

for( j in Mapping_new_names){

if(!is.na(my.env[[names(my.env)[k]]][,n][f])){

if(my.env[[names(my.env)[k]]][,n][f]==Mapping_new_names[[s]]){

my.env[[names(my.env)[k]]][,n][f]<-Mapping_previous_names[[s]]
break

}
s<-s+1
}

}
s<-1
f<-f+1

}
s<-1
f<-1
n<-n+1
}
k<-k+1
s<-1
n<-1
f<-1
}
######################################################################################################### Compute & save multiple data assessment filesfor(i in names(my.env)){
for( j in Source_Files){
if(i == j){
write.table(render_diff(diff_data(my.env[[i]],read.csv(paste(Source_Files_Dir,"\\",j,sep=""),fileEncoding="UTF-8-BOM"),Source_Files[Source_Files==j],
show_unchanged = TRUE,show_unchanged_columns = TRUE),title =paste("Changes_in_file",i) ),file =
paste(Data_Assessment_Save_Loc,"/Changes_in_",i,".html",sep = "")
,quote = FALSE,col.names = FALSE,
row.names = FALSE)

}
}
}
# Produce csv files that allows a user to browse through the changeslist_of_detailed_changes_files=list()
z<-1
for( i in names(my.env)){
for (j in Source_Files){

if(i == j){
detailed_changes<-diff_data(my.env[[i]],read.csv(paste(Source_Files_Dir,"\\",j,sep=""),fileEncoding="UTF-8-BOM"),show_unchanged = TRUE,show_unchanged_columns = TRUE)

write_diff(detailed_changes,file=paste(Detailed_Changes_Save_Loc,"\\New_",i,sep=""))

x<-read.csv(file=paste(Detailed_Changes_Save_Loc,"/New_",i,sep=""),fill = TRUE)

if(z<10^1000000000000000){
list_of_detailed_changes_files[[z]]<-x
names(list_of_detailed_changes_files)[z]<-i
z<-z+1
}



}
}
}
# Produce patched up files ( the corrected version of the file )for(i in names(my.env)){
for(j in Source_Files){

if(i==j){

patch<-diff_data(my.env[[i]],read.csv(paste(Source_Files_Dir,"\\",j,sep=""),fileEncoding="UTF-8-BOM"),show_unchanged = TRUE,show_unchanged_columns = TRUE)

patched_file<-patch_data(my.env[[i]],patch)

write.csv(patched_file,file = paste(Patched_Files_Save_Loc,"/Patched_",i,sep = ""),row.names = FALSE)

}

}

}
#Accuracy scores based on the number of unaltered rows/records.list_of_scores=list()
g<-0
for(i in list_of_detailed_changes_files){


score<-((nrow(subset(as.data.frame(i),as.data.frame(i)[,2]==""))/((nrow(as.data.frame(i)))-1)*100))

g<-g+1

list_of_scores=append(list_of_scores,paste("The percentage of rows that have remained nearly unchanged for",names(list_of_detailed_changes_files[g]),"is", round(score,2),"%" ))



}
write.table(list_of_scores,file=paste(Scores_Save_Loc,"/Scores_Migrated.csv",sep=""),row.names = FALSE,col.names = FALSE,sep="\n")#### END ####
#The other script's location should be specified in the below line
The_Other_Script<-source("C:\\Users\\New_Docs\\Final_Main_Script_V1.R")
First final output as a html file.
Here’s a pikachu and a squirtle to celebrate your success!

--

--

--

Highly enthusiastic about the field of data science and how it can transform the world by solving any dilemma ,regardless of the size and context of it . Skille

Love podcasts or audiobooks? Learn on the go with our new app.

Recommended from Medium

Big Data Analyzed Fast & Easy for Python Users

6block, focusing on ZK Computing, has officially launched its Mining Platform for Zero Knowledge…

Getting started with D3.js force simulations

Everyone of you already has the knowledge to profit from NFTs

Dog sitting in front of a computer and wearing glasses

Lining up Audio & Visual to Build Cutscenes in Unity

Next Level Todo-App: Using React, Typescript, Capacitor, and GCP.

Part 2 The Before and After Phase of Content I Proofread, Revised, Edited, and Rearranged for…

5 Big ways you can make money as a developer

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store
Harsh Ruchir Edirisinghe

Harsh Ruchir Edirisinghe

Highly enthusiastic about the field of data science and how it can transform the world by solving any dilemma ,regardless of the size and context of it . Skille

More from Medium

Giving Everyone Data Superpowers

Importance of quality data — A Review of “Data Cascades in High-Stakes AI” by Sambasivan et al.

BIG DATA AND BUSINESS INTELLIGENCE

3 Important Facts about Data Sources in Google Data Studio