options ( java.parameters = "-Xmx14336m" ) ## memory set to 14 GB
library ( "XLConnect" )
library ( "sqldf" )
library ( "tcltk" )
startDate <- "2015-10-01"
endDate <- "2016-09-30"
df <- readWorksheetFromFile ( "JPS_Raw_Data.xlsx" , sheet = 1 , startRow = 2 )
#sampleVector <- sample(1:nrow(df), 30000)
#df2 <- df[sampleVector,]
#write.csv(df2, file="Sample of JPS_Raw_Data (30000).csv", na="")
#df3 <- read.csv("Sample of JPS_Raw_Data (30000).csv")
### Formatting ###################################
df3 [ is.na ( df3 )] <- ""
df3 $ Participant.Enterprise.Identifier <- gsub ( "-" , "" , df3 $ Participant.Enterprise.Identifier )
colnames ( df3 )[ 2 ] <- "peid"
colnames ( df3 )[ 5 ] <- "CaseNumber"
colnames ( df3 )[ 7 ] <- "Gender"
colnames ( df3 )[ 8 ] <- "Race"
colnames ( df3 )[ 9 ] <- "Ethnicity"
colnames ( df3 )[ 10 ] <- "ProgramName"
colnames ( df3 )[ 11 ] <- "SiteName"
colnames ( df3 )[ 12 ] <- "AgreesToShareOne"
colnames ( df3 )[ 13 ] <- "AgreesToShareTwo"
colnames ( df3 )[ 17 ] <- "ScanCardIssuedDate"
colnames ( df3 )[ 21 ] <- "ProgramStartDate"
df3 $ ProgramStartDate <- as.character ( df3 $ ProgramStartDate )
colnames ( df3 )[ 22 ] <- "ProgramEndDate"
df3 $ ProgramEndDate <- as.character ( df3 $ ProgramEndDate )
df3 $ ScanCardIssuedDate <- as.character ( df3 $ ScanCardIssuedDate )
colnames ( df3 )[ 19 ] <- "OutreachContactDate"
df3 $ OutreachContactDate <- as.character ( df3 $ OutreachContactDate )
##################################################
# Filter to only participants who agree to share information.
df4 <- sqldf ( "SELECT * FROM df3
WHERE (
AgreesToShareOne == 'Yes'
OR AgreesToShareTwo == 'Yes')
" )
##################################################
####### Start ####################################
####### Get Most Recent Scancard PEIDs ###########
##################################################
df5a <- sqldf ( "SELECT * FROM df4 WHERE ScanCardIssuedDate != ''" )
# Filter to Scan Card Creations (First time Homeless) or Return in Six Months
str2 <- paste ( "SELECT * FROM df5a WHERE ScanCardIssuedDate > '" , startDate , "' AND ScanCardIssuedDate < '" , endDate , "'" , sep = "" )
df5a <- sqldf ( str2 )
str <- paste ( "SELECT peid, MAX(ScanCardIssuedDate) AS 'MostRecentScanCardDate', 'Scan-card' As 'DateType', Value_1712 As 'Issuance Type', 'Scan-card' As 'ContactType' FROM df5a WHERE Value_1712 = 'Scan Card Creation (First time homeless clients)' OR Value_1712 = 'Scan Card Renewal (clients who return to the shelter after six months of being away)' GROUP BY peid ORDER BY MostRecentScanCardDate DESC" , sep = "" )
df5a <- sqldf ( str )
##################################################
####### End = df5a ###############################
####### Get Most Recent Scancard PEIDs ###########
##################################################
##################################################
####### Start ####################################
####### Get most recent Outreach Contact #########
##################################################
df5b <- sqldf ( "SELECT *
FROM df4
WHERE Outreach_Contact_2478 != ''
" )
str <- paste ( "SELECT peid, MAX(Outreach_Contact_2478) As 'MostRecentOutreachContact', 'OutreachContact' As 'DateType' FROM df5b WHERE Outreach_Contact_2478 > '" , startDate , "' AND Outreach_Contact_2478 < '" , endDate , "' GROUP BY peid ORDER BY MostRecentOutreachContact DESC" , sep = "" )
df5b <- sqldf ( str )
##################################################
####### End = df5a ###############################
####### Get most recent Outreach Contact #########
##################################################
##################################################
####### Start ####################################
####### Get most recent Program Enrollment #######
##################################################
#### 'CD PIT ES LTB' Program Group #######
# Emergency Youth Shelter
# ALS Emergency Shelter
# Employment Program
# Emergency Shelter
# PNS-Lowden Schutts Program for Women and Children
# PNS-Moving Home
# Veteran's Voice Shelter Based
# S.T.A.R.T
#### 'TH JPS Project' Program Group
# Families Together TH
# ARL.HA -Transitional Housing
# YWCA-TBLA 114 RRH
# CEC TH
# CEC- TBLA 114 Transitional Housing
# 3CP
# GRACE-Transitional Housing - TBLA 114
# NASH TH
# Liberty House TH
# MHMR-HS- TBLA 114 TH
# PNS-Veteran Transitional Living
# TC-TBLA 114 Transitional Housing TCCD
# SIMON
# The Salvation Army Mabee Center -- TBLA 114
#### Individual Program Groups
# ALS Emergency Shelter
# Emergency Shelter
df5c <- sqldf ( "SELECT *
FROM df4
WHERE ProgramName = 'Emergency Youth Shelter'
OR ProgramName = 'ALS Emergency Shelter'
OR ProgramName = 'Employment Program'
OR ProgramName = 'Emergency Shelter'
OR ProgramName = 'PNS-Lowden Schutts Program for Women and Children'
OR ProgramName = 'PNS-Moving Home'
OR ProgramName = 'Veteran''s Voice Shelter Based'
OR ProgramName = 'S.T.A.R.T'
OR ProgramName = 'YWCA-TBLA 114 RRH'
OR ProgramName = 'CEC TH'
OR ProgramName = 'CEC- TBLA 114 Transitional Housing'
OR ProgramName = '3CP'
OR ProgramName = 'GRACE-Transitional Housing - TBLA 114'
OR ProgramName = 'NASH TH'
OR ProgramName = 'Liberty House TH'
OR ProgramName = 'MHMR-HS- TBLA 114 TH'
OR ProgramName = 'PNS-Veteran Transitional Living'
OR ProgramName = 'TC-TBLA 114 Transitional Housing TCCD'
OR ProgramName = 'SIMON'
OR ProgramName = 'The Salvation Army Mabee Center -- TBLA 114'
OR ProgramName = 'ALS Emergency Shelter'
" )
# TODO: Fix ProgramEndDate to remove HH:MM:SS instead of hacking it.
df5c <- sqldf ( "SELECT *, date(ProgramEndDate) As ProgramEndDate2 FROM df5c" )
#df5c <- sqldf("SELECT * FROM df5c WHERE ProgramStartDate >= '2015-10-01'
# AND ( ProgramEndDate2 = ''
# OR ProgramEndDate2 >= '2015-01'
# )
# ")
df5c <- activeFilter ( df5c , 'ProgramStartDate' , 'ProgramEndDate2' , startDate , endDate )
df5c <- sqldf ( "SELECT peid, ProgramName, SiteName, MAX(ProgramStartDate) As 'MostRecentProgramStart', ProgramEndDate As 'MostRecentProgramEnd'
FROM df5c
GROUP BY peid
ORDER BY MostRecentProgramStart
" )
##################################################
####### Start ####################################
####### Aggregate Outreach, Scancard, Program ####
##################################################
df6 <- sqldf ( "SELECT * FROM df5a a LEFT JOIN df5b b ON a.peid=b.peid" )
df6 <- subset ( df6 )
df6 <- sqldf ( "SELECT * FROM df6 a LEFT JOIN df5c b ON a.peid=b.peid" )
df6 <- subset ( df6 )
df6 $ MostRecentProgramStart [ is.na ( df6 $ MostRecentProgramStart )] <- "1900-01-01"
df6 $ MostRecentOutreachContact [ is.na ( df6 $ MostRecentOutreachContact )] <- "1900-01-01"
df6 $ MostRecentScanCardDate [ is.na ( df6 $ MostRecentScanCardDate )] <- "1900-01-01"
#df6 <- sqldf(c("UPDATE df6 SET MostRecentScanCardDate = replace(MostRecentScanCardDate, '', '1900-01-01')", "SELECT * FROM df6"))
#df6 <- sqldf(c("UPDATE df6 SET MostRecentProgramStart = replace(MostRecentProgramStart, '', '1900-01-01')", "SELECT MostRecentProgramStart FROM df6"))
df6 <- sqldf ( "SELECT DISTINCT(peid),
CASE
WHEN MostRecentScanCardDate > MostRecentOutreachContact
AND MostRecentScanCardDate > MostRecentProgramStart
THEN MostRecentScanCardDate
WHEN MostRecentProgramStart > MostRecentOutreachContact
THEN MostRecentProgramStart
WHEN MostRecentOutreachContact = MAX(MostRecentScanCardDate, MostRecentProgramStart, MostRecentOutreachContact)
THEN MostRecentOutreachContact
ELSE 'Unknown'
END AS LastContactDate,
CASE
WHEN MostRecentScanCardDate > MostRecentOutreachContact
AND MostRecentScanCardDate > MostRecentProgramStart
THEN 'Scan Card Issuance'
WHEN MostRecentProgramStart > MostRecentOutreachContact
THEN 'Program Start Date'
WHEN MostRecentOutreachContact = MAX(MostRecentScanCardDate, MostRecentProgramStart, MostRecentOutreachContact)
THEN 'Outreach Contact'
ELSE 'Unknown'
END AS ContactDateType
FROM df6
" )
##################################################
####### End = df6 ################################
####### Aggregate Outreach, Scancard, Program ####
##################################################
##################################################
####### Start ####################################
####### Add Demographics #########################
##################################################
df7 <- sqldf ( "SELECT DISTINCT(peid), SSN, Name, CaseNumber, DOB, Gender, Race, Ethnicity
FROM df3
" )
df7 <- sqldf ( "SELECT a.*, b.SSN, b.Name, b.CaseNumber, b.DOB, b.Gender, b.Race, b.Ethnicity
FROM df6 a
INNER JOIN df7 b
ON a.peid=b.peid
" )
# activeRecords <- activeFilter(df, "occStartDate", "occEndDate", "2017-01-23", '2017-01-26')
activeFilter <- function ( df , beginDate , endDate , beginRange , endRange ){
df [ is.na ( df )] <- ""
str <- paste ( "SELECT * FROM df WHERE " , beginDate , " >= '" , beginRange , "' AND ( " , endDate , " = '' OR " , endDate , " >= '" , beginRange , "')" , sep = "" )
#print(str)
df <- sqldf ( str )
df
}