library(readstata13)


#First and last year (both INCLUSIVE) that every variable must cover
minyear <- 1990
maxyear <- 2014

#Name of the year column in the Stata file (checked against the dataset's column names)
yrlab_stata <- "year"

#################
############
#Code (no more inputs)
############
#################

#Directories
#The script must be started from the data directory; as a sanity check its
#path has to contain "zm".
datadir <- getwd()
stopifnot(grepl("zm", datadir, fixed = TRUE))
#The script directory is resolved relative to the data directory.
setwd("../SCRIPTS/1_variables")
scriptdir <- getwd()

#DO NOT CHANGE unless you know you need to!
#When TRUE, the end of the script writes regenerated do-files to the data directory.
write_new <- FALSE

#############
#Get the variables of interest
#############
#Read every "libcons" do-file in the script directory into one data frame
#(varDF) with one row per line of the do-file and these columns:
#  rawstring   - the line as read
#  isvar       - 1 if the line looks like a variable definition, else 0
#  cleanstring - the line with tabs replaced by spaces
#  varname     - the variable name parsed from the line (NA for non-variable lines)
#  ident       - identifier taken from the file name
setwd(scriptdir)
varfiles<-list.files(pattern="libcons")

#Fail early with a clear message when nothing matches, rather than failing
#later on an empty result.
if(length(varfiles)==0){
	stop("No files matching 'libcons' found in ",scriptdir)
}

varDF.list<-list()
for(i in seq_along(varfiles)){
	#Identifier = the part of the file name between "gen_" and "_libcons.do"
	ident<-gsub("gen\\_(.*?)\\_libcons.do","\\1",varfiles[i])
	#NOTE(review): read.table's defaults skip blank lines and apply quote and
	#comment-character handling; confirm this is acceptable for these do-files.
	varDF<-read.table(varfiles[i],sep="\n",stringsAsFactors=FALSE)
	colnames(varDF)<-c("rawstring")
	#A variable line contains "replace" followed (later on the line) by "!=."
	varDF$isvar<-ifelse(grepl("replace.*?!=\\.",varDF$rawstring),1,0)
	varDF$cleanstring<-gsub("\t"," ",varDF$rawstring,fixed=TRUE)
	
	if(write_new){
		#Reset the comment blocks. We'll comment out unusable ones at the end.
		varDF$cleanstring<-ifelse(varDF$isvar==1, 
							gsub("//","",varDF$cleanstring,fixed=TRUE), 
							varDF$cleanstring)
	}	
	#The variable name is the text between "if" and "!="
	varDF$varname<-ifelse(varDF$isvar==1, gsub(".*?if\\s+(.*?)\\s*!=.+","\\1",varDF$cleanstring), NA)
	#Keep at most the first 32 characters of the name (NA stays NA)
	varDF$varname<-substr(varDF$varname,1,32)
	varDF$ident<-ident
	varDF.list[[ident]]<-varDF
}
varDF<-do.call(rbind,varDF.list)
#Plain 1..n row names; rows are looked up by row name further down
rownames(varDF)<-seq_len(nrow(varDF))
#Every file must have produced its own distinct identifier
stopifnot(length(unique(varDF$ident))==length(varfiles))

################
#Get the dataset
################
#Load the Stata dataset from the data directory without converting
#labelled values to factors.
setwd(datadir)
df.data <- read.dta13("dataset.dta", convert.factors = FALSE)

#The year column must exist, be numeric, and every value must contain
#four consecutive digits. Checks run in order and stop at the first failure.
stopifnot(
	yrlab_stata %in% colnames(df.data),
	is.numeric(df.data[,yrlab_stata]),
	all(grepl("\\d{4}",df.data[,yrlab_stata]))
)

###############
#Map the variables to the column in the data
###############
#1 if the parsed variable name is a column of the dataset, 0 otherwise
#(rows without a variable name get 0)
varDF$colexist <- as.numeric(varDF$varname %in% colnames(df.data))

#Variable lines whose column is missing from the data
.absent <- varDF$colexist == 0 & varDF$isvar == 1
no_match <- varDF[.absent, ]

#Prefix those lines with the Stata comment marker
varDF$cleanstring[.absent] <- paste("//", varDF$cleanstring[.absent])

#Drop the names of variables that do not exist; they are not needed any more
varDF$varname[varDF$colexist == 0] <- NA

###############
#What is the range of years for each variable in the data?
#Define a function to check whether a variable exists for the FULL desired range.
###############
#Debugging aid, never executed: run the body by hand to get the same argument
#names checkyr() uses, then step through the function line by line.
if(FALSE){
	sourcedf <- df.data
	yrlab <- yrlab_stata
	minyr <- minyear
	maxyr <- maxyear
	var <- "abortion_partial_birth"
}

#Check whether a variable has non-missing observations in every year of a range.
#Arguments:
#  var      - name of the column to check (a single string, or NA)
#  sourcedf - data frame containing the year column and `var`
#  yrlab    - name of the year column in sourcedf
#  minyr    - first year of the range (inclusive)
#  maxyr    - last year of the range (inclusive)
#Returns:
#  NA if var is NA;
#  1  if the years with complete (year, var) rows run from minyr to maxyr
#     in steps of exactly 1;
#  0  otherwise (including when there are no observations in the range).
checkyr<-function(var,sourcedf,yrlab,minyr,maxyr){
	#Nothing to look up
	if(is.na(var)){
		return(NA)
	}

	#Rows where both the year and the variable are observed
	.pairs<-sourcedf[,c(yrlab,var)]
	.pairs<-.pairs[complete.cases(.pairs),]

	#Distinct observed years inside the requested window, ascending
	.inwindow<-.pairs[,yrlab]>=minyr & .pairs[,yrlab]<=maxyr
	.years<-sort(unique(.pairs[.inwindow,yrlab]))

	#If the var has no obs in year range!
	if(length(.years)==0){
		return(0)
	}

	#Full coverage = no gaps between consecutive years and both endpoints hit
	.steps<-abs(diff(.years))
	.covered<-all(.steps==1) && min(.years)==minyr && max(.years)==maxyr
	as.numeric(.covered)
}

#Coverage flag per row, as returned by checkyr(): 1 = full range,
#0 = not full, NA = no variable name on that row.
#vapply (not sapply) guarantees one numeric value per row whatever the input,
#promoting the logical NA returned for rows without a variable name.
varDF$isfull<-vapply(varDF$varname,checkyr,numeric(1),
					sourcedf=df.data,yrlab=yrlab_stata,
					minyr=minyear,maxyr=maxyear,
					USE.NAMES=FALSE)

#########
#Confirm that we've commented out all the unusable VARIABLES
#########
#marked: variable line whose text contains the "//" comment marker
varDF$marked<-as.numeric(grepl("//",varDF$cleanstring,fixed=TRUE) & varDF$isvar==1)

#unusable: variable line whose column is missing or does not span the full
#year range. Non-variable lines are never unusable, so they are never expected
#to be marked.
varDF$unusable<-as.numeric(
	varDF$isvar==1 &
	(varDF$colexist==0 | is.na(varDF$colexist) |
	 varDF$isfull==0 | is.na(varDF$isfull))
)

#The do-files are consistent with the data only if exactly the unusable
#variable lines are the commented-out ones.
if(all(varDF$unusable==varDF$marked)){
	print("Confirmed: all policies have correct range")
}else{
	warning("SOME POLICIES ARE NOT USABLE!!!")
}

###########################
##############
#No more relevant code past this point.
##############
###########################

#Optional output step, run only when write_new is TRUE.
#It writes a regenerated copy of each do-file into the data directory and
#refuses to write when a file of that name already exists there.
#Don't use this unless you know why you're using it and you meant to use it.

if(write_new){
	#########
	#Add back comment markers for vars with unusable data (if it exists but is not full)
	#########
	varDF$finstring<-ifelse(varDF$isfull==0 & varDF$colexist==1,
						paste("//",varDF$cleanstring),
						varDF$cleanstring
	)
	
	#########
	#Set up files to write out
	#########
	setwd(datadir)
	
	outDF<-varDF[,c("finstring","ident")]
	for(i in unique(outDF$ident)){
		.outfilename<-paste0("gen_",i,"_libcons.do")
		#Skip names that did not come from the script directory, and never
		#overwrite a file already present in the data directory
		if(!(.outfilename %in% varfiles) || .outfilename %in% list.files()){
			next
		}
		#Lines of this file followed by a closing row of slashes
		.outfin<-rbind(outDF[outDF$ident==i,],
					data.frame(finstring="////////////////",ident="not_applicable"))
		write.table(.outfin[,"finstring"],file=.outfilename,
					sep="\n",row.names=FALSE,col.names=FALSE,quote=FALSE)
	}
}