JPS DSRIP Report V2.0

Reading time ~4 minutes

JPS DSRIP Report V2.0

options(java.parameters = "-Xmx14336m")  ## memory set to 14 GB
library("XLConnect")
library("sqldf")
library("tcltk")

startDate <- "2015-10-01"
endDate <- "2016-09-30"

df <- readWorksheetFromFile("JPS_Raw_Data.xlsx", sheet = 1, startRow = 2)

#sampleVector <- sample(1:nrow(df), 30000)
#df2 <- df[sampleVector,]

#write.csv(df2, file="Sample of JPS_Raw_Data (30000).csv", na="")

#df3 <- read.csv("Sample of JPS_Raw_Data (30000).csv")

### Formatting ###################################
df3[is.na(df3)] <- ""
df3$Participant.Enterprise.Identifier <- gsub("-", "", df3$Participant.Enterprise.Identifier)
colnames(df3)[2] <- "peid"
colnames(df3)[5] <- "CaseNumber"
colnames(df3)[7] <- "Gender"
colnames(df3)[8] <- "Race"
colnames(df3)[9] <- "Ethnicity"
colnames(df3)[10] <- "ProgramName"
colnames(df3)[11] <- "SiteName"
colnames(df3)[12] <- "AgreesToShareOne"
colnames(df3)[13] <- "AgreesToShareTwo"
colnames(df3)[17] <- "ScanCardIssuedDate"
colnames(df3)[21] <- "ProgramStartDate"
df3$ProgramStartDate <- as.character(df3$ProgramStartDate)
colnames(df3)[22] <- "ProgramEndDate"
df3$ProgramEndDate <- as.character(df3$ProgramEndDate)
df3$ScanCardIssuedDate <- as.character(df3$ScanCardIssuedDate)
colnames(df3)[19] <- "OutreachContactDate"
df3$OutreachContactDate <- as.character(df3$OutreachContactDate)

##################################################

# Filter to only participants who agree to share information.
df4 <- sqldf("SELECT * FROM df3 
              WHERE (
                 AgreesToShareOne == 'Yes' 
                OR AgreesToShareTwo == 'Yes')
             ")

##################################################
####### Start ####################################
####### Get Most Recent Scancard PEIDs ###########
##################################################

df5a <- sqldf("SELECT * FROM df4 WHERE ScanCardIssuedDate != ''")

# Filter to Scan Card Creations (First time Homeless) or Return in Six Months
str2 <- paste("SELECT * FROM df5a WHERE ScanCardIssuedDate > '", startDate, "' AND ScanCardIssuedDate < '", endDate, "'", sep="")
df5a <- sqldf(str2)

str <- paste("SELECT peid, MAX(ScanCardIssuedDate) AS 'MostRecentScanCardDate', 'Scan-card' As 'DateType', Value_1712 As 'Issuance Type', 'Scan-card' As 'ContactType' FROM df5a WHERE Value_1712 = 'Scan Card Creation (First time homeless clients)' OR  Value_1712 = 'Scan Card Renewal (clients who return to the shelter after six months of being away)' GROUP BY peid ORDER BY MostRecentScanCardDate DESC", sep = "")

df5a <- sqldf(str)
##################################################
####### End = df5a ###############################
####### Get Most Recent Scancard PEIDs ###########
##################################################

##################################################
####### Start ####################################
####### Get most recent Outreach Contact #########
##################################################

df5b <- sqldf("SELECT *
              FROM df4 
              WHERE Outreach_Contact_2478 != ''
              ")

str <- paste("SELECT peid, MAX(Outreach_Contact_2478) As 'MostRecentOutreachContact', 'OutreachContact' As 'DateType' FROM df5b WHERE Outreach_Contact_2478 > '", startDate, "' AND Outreach_Contact_2478  < '", endDate, "' GROUP BY peid ORDER BY MostRecentOutreachContact DESC", sep = "")

df5b <- sqldf(str)
##################################################
####### End = df5a ###############################
####### Get most recent Outreach Contact #########
##################################################

##################################################
####### Start ####################################
####### Get most recent Program Enrollment #######
##################################################

#### 'CD PIT ES LTB' Program Group #######
  # Emergency Youth Shelter
  # ALS Emergency Shelter
  # Employment Program
  # Emergency Shelter
  # PNS-Lowden Schutts Program for Women and Children
  # PNS-Moving Home
  # Veteran's Voice Shelter Based
  # S.T.A.R.T

#### 'TH JPS Project' Program Group
  # Families Together TH
  # ARL.HA -Transitional Housing
  # YWCA-TBLA 114 RRH
  # CEC TH
  # CEC- TBLA 114 Transitional Housing
  # 3CP
  # GRACE-Transitional Housing - TBLA 114
  # NASH TH
  # Liberty House TH
  # MHMR-HS- TBLA 114 TH
  # PNS-Veteran Transitional Living
  # TC-TBLA 114 Transitional Housing TCCD
  # SIMON
  # The Salvation Army Mabee Center -- TBLA 114

#### Individual Program Groups
  # ALS Emergency Shelter
  # Emergency Shelter

df5c <- sqldf("SELECT * 
              FROM df4
              WHERE ProgramName = 'Emergency Youth Shelter'
                 OR ProgramName = 'ALS Emergency Shelter'
                 OR ProgramName = 'Employment Program'
                 OR ProgramName = 'Emergency Shelter'
                 OR ProgramName = 'PNS-Lowden Schutts Program for Women and Children'
                 OR ProgramName = 'PNS-Moving Home'
                 OR ProgramName = 'Veteran''s Voice Shelter Based'
                 OR ProgramName = 'S.T.A.R.T'

                OR ProgramName = 'YWCA-TBLA 114 RRH'
                OR ProgramName = 'CEC TH'
                OR ProgramName = 'CEC- TBLA 114 Transitional Housing'
                OR ProgramName = '3CP'
                OR ProgramName = 'GRACE-Transitional Housing - TBLA 114'
                OR ProgramName = 'NASH TH'
                OR ProgramName = 'Liberty House TH'
                OR ProgramName = 'MHMR-HS- TBLA 114 TH'
                OR ProgramName = 'PNS-Veteran Transitional Living'
                OR ProgramName = 'TC-TBLA 114 Transitional Housing TCCD'
                OR ProgramName = 'SIMON'
                OR ProgramName = 'The Salvation Army Mabee Center -- TBLA 114'

                OR ProgramName = 'ALS Emergency Shelter'
              ")

# TODO: Fix ProgramEndDate to remove HH:MM:SS instead of hacking it.
df5c <- sqldf("SELECT *, date(ProgramEndDate) As ProgramEndDate2 FROM df5c")

#df5c <- sqldf("SELECT * FROM df5c WHERE ProgramStartDate >= '2015-10-01'
#                                  AND ( ProgramEndDate2 = ''
#                                        OR ProgramEndDate2 >= '2015-01'
#                                      ) 
#              ")

df5c <- activeFilter(df5c, 'ProgramStartDate', 'ProgramEndDate2', startDate, endDate)

df5c <- sqldf("SELECT peid, ProgramName, SiteName, MAX(ProgramStartDate) As 'MostRecentProgramStart', ProgramEndDate As 'MostRecentProgramEnd' 
              FROM df5c
              GROUP BY peid
              ORDER BY MostRecentProgramStart
              ")

##################################################
####### Start ####################################
####### Aggregate Outreach, Scancard, Program ####
##################################################

df6 <- sqldf("SELECT * FROM df5a a LEFT JOIN df5b b ON a.peid=b.peid")
df6 <- subset(df6)
df6 <- sqldf("SELECT * FROM df6 a LEFT JOIN df5c b ON a.peid=b.peid")
df6 <- subset(df6)

df6$MostRecentProgramStart[is.na(df6$MostRecentProgramStart)] <- "1900-01-01"
df6$MostRecentOutreachContact[is.na(df6$MostRecentOutreachContact)] <- "1900-01-01"
df6$MostRecentScanCardDate[is.na(df6$MostRecentScanCardDate)] <- "1900-01-01"

#df6 <- sqldf(c("UPDATE df6 SET MostRecentScanCardDate = replace(MostRecentScanCardDate, '', '1900-01-01')", "SELECT * FROM df6"))
#df6 <- sqldf(c("UPDATE df6 SET MostRecentProgramStart = replace(MostRecentProgramStart, '', '1900-01-01')", "SELECT MostRecentProgramStart FROM df6"))

df6 <- sqldf("SELECT DISTINCT(peid), 
             CASE 
                WHEN MostRecentScanCardDate > MostRecentOutreachContact
                AND MostRecentScanCardDate > MostRecentProgramStart
              THEN MostRecentScanCardDate 
                WHEN MostRecentProgramStart > MostRecentOutreachContact
              THEN MostRecentProgramStart
                WHEN MostRecentOutreachContact = MAX(MostRecentScanCardDate, MostRecentProgramStart, MostRecentOutreachContact)
              THEN MostRecentOutreachContact
              ELSE 'Unknown'
              END AS LastContactDate, 
             CASE 
                WHEN MostRecentScanCardDate > MostRecentOutreachContact
                AND MostRecentScanCardDate > MostRecentProgramStart
             THEN 'Scan Card Issuance' 
                WHEN MostRecentProgramStart > MostRecentOutreachContact
             THEN 'Program Start Date'
                WHEN MostRecentOutreachContact = MAX(MostRecentScanCardDate, MostRecentProgramStart, MostRecentOutreachContact)
             THEN 'Outreach Contact'
             ELSE 'Unknown'
             END AS ContactDateType 
             FROM df6
             ")

##################################################
####### End = df6 ################################
####### Aggregate Outreach, Scancard, Program ####
##################################################

##################################################
####### Start ####################################
####### Add Demographics #########################
##################################################

df7 <- sqldf("SELECT DISTINCT(peid), SSN, Name, CaseNumber, DOB, Gender, Race, Ethnicity
             FROM df3
             ")

df7 <- sqldf("SELECT a.*, b.SSN, b.Name, b.CaseNumber, b.DOB, b.Gender, b.Race, b.Ethnicity
             FROM df6 a
             INNER JOIN df7 b
             ON a.peid=b.peid
             ")

# activeRecords <- activeFilter(df, "occStartDate", "occEndDate", "2017-01-23", '2017-01-26')
activeFilter <- function(df, beginDate, endDate, beginRange, endRange){
  df[is.na(df)] <- ""
  str <- paste("SELECT * FROM df WHERE ", beginDate, " >= '", beginRange, "' AND ( ", endDate, " = '' OR ", endDate, " >= '", beginRange, "')", sep = "")
  #print(str)
  df <- sqldf(str)
  df
}

What is a Data Warehouse

## Insights over DataData. They are the plastic of the tech world. We're are making way too much of it, you can't seem to get rid of it, ...… Continue reading