# Intro to Data Manipulation with R

Hello World,

Here is a recorded version of an in-person training I have been doing.  Enjoy.  I end up coming back to this myself even for reference.

This episode is all about performing data manipulation to derive raw insights from your data using the R programming language.  Data manipulation is the core to anything and everything you do in business intelligence and machine learning.  This episode sets the base for all R based intelligence sessions from here on out.

Part 1: Introduction to Microsoft R Open.

Part 2: Introduction to R Data Structures

Part 3: Data Manipulation with R

Part 4: Beautiful Visualizations with R

Introduction
What is Data Manipulation
dplyr & magrittr
Install Packages
magrittr & pipe operator
reading in a .csv & cleaning names
Feature Manipulation
Observation Manipulation
Group Manipulation
Challenge Exersizes

```#########################
#Libraries/Dependencies#
########################
install.packages("magrittr")
install.packages("dplyr")
install.packages("checkpoint")
library("magrittr")
library("dplyr")
library("checkpoint")
checkpoint("2016-03-29")
#########################
#'%>%' The Pipe Operator#
#########################

mult = function(x,y){ x * y}
#2 * 6 * 5 * 10
mult(10, mult(5, mult(6, 2)))
#Piped Method
#output into first parameter of next function
2 %>%
mult(6) %>%
mult(5) %>%
mult(10)

###################
#Data In & Rename#
##################

sep=",")
m311
#Fix Formatting
m311 <- m311 %>% tbl_df()
#Summary of Data Set
summary(m311\$Case.Owner)
summary(m311)
#Renaming Part 1: Observations with Look Up Table (LUT)
caseOwnerLUT = c("Animal_Services" = "Animal Services",
"Public_Works_Construction-6-60" = "Construction",
"Public_Works_Mosquito_Control-8-60" = "Mosquito Control",
"Public_Works_Traffic_Engineering-10-60" = "Traffic Engineering",
"Public_Works_Waste_Management" = "Waste Management",
"Regulatory_and_Economic_Resources" = "Regulatory and Economic Resources",
"Community_Information_and_Outreach" = "Information and Outreach",
"Public_Works_Hwy_Engineering-5-60" = "Traffic Engineering",
"Public_Works_traffic_Signals_And_Signs-15-60" = "Traffic Engineering",
"RAAM-27-93" = "RAAM",
"Waste_Management" = "Waste Management"
)
m311\$Case.Owner = caseOwnerLUT[m311\$Case.Owner] %>%
factor()
summary(m311\$Case.Owner)
labels(m311)
#Renaming Part 2: Features
m311 = m311 %>%
rename(CreatedDate = Ticket.Created.Date...Time) %>%
rename(LastUpdatedDate = Ticket.Last.Updated.Date...Time) %>%
rename(ClosedDate = Ticket.Closed.Date...Time) %>%
rename(District = Neighborhood...District...Ward...etc.) %>%
rename(Zip = Zip...Postal.Code) %>%
rename(State = State...Province)

#Sampling
m311f = m311 %>% sample_frac(.1)
m311n = m311 %>% sample_n(1000)

#######################
#Feature Manipulation#
######################

#Select a few columns
m311 %>%
select(Case.Owner, CreatedDate)

#Exclude a column
m311 %>%
select(-ClosedDate)

#Select with helper
x = m311 %>%
select(contains("Issue"), Case.Owner)
x
distinct(x)

#Mutate
x = m311 %>%
mutate(
GoalKpi = (Goal.Days - Actual.Completed.Days) / Goal.Days
)
summary(x\$GoalKpi) #obviously some issues with this calculation, like divide by zero perhaps?

###########################
#Observation Manipulation#
##########################

goals = m311 %>%
select(Case.Owner, Issue.Type, Goal.Days, Actual.Completed.Days, Ticket.Status)

zeroGoal = goals %>%
filter(Goal.Days <= 0)
#how many are there?
nrow(zeroGoal)
summary(zeroGoal\$Ticket.Status)
#Lets reduce our factors a bit here.
zeroGoal\$Issue.Type = zeroGoal\$Issue.Type %>% factor()
zeroGoal\$Ticket.Status = factor(zeroGoal\$Ticket.Status)

zeroGoal %>%
arrange(desc(Actual.Completed.Days)) %>%
glimpse()

zeroGoal\$Actual.Completed.Days = as.numeric(zeroGoal\$Actual.Completed.Days)

#Check Out Summaries
zeroGoal %>%
summarise(
mean = mean(Actual.Completed.Days, na.rm = TRUE), #easy way
sd = sd(Actual.Completed.Days, na.rm = TRUE),
median = median(Actual.Completed.Days[!is.na(Actual.Completed.Days)]), #hard way
observations = n(),
NasInActualCompleted = sum(is.na(Actual.Completed.Days))
)

#####################
#Group Manipulation#
####################

x = zeroGoal %>%
group_by(Case.Owner)
class(x)
x
#looks like a dataframe

#but you can do grouped operations
zeroGoal %>%
group_by(Case.Owner) %>%
summarise(
obs = n(),
avg.comp.days = mean(Actual.Completed.Days, na.rm = TRUE),
max.comp.days = max(Actual.Completed.Days, na.rm = TRUE),
min.comp.days = min(Actual.Completed.Days, na.rm = TRUE)
) %>%
arrange(avg.comp.days)

#############################
#Practice Exersices - Part 1#
#############################
# For Each Year, What were #
# the top 3 Issues and #
# average resolution time #
############################

#Hints
install.packages("lubridate")
library("lubridate")
summary(m311\$CreatedDate)

m311f\$CreatedDate2 = as.Date(m311f\$CreatedDate,
format="%m/%d/%Y")

year(m311f\$CreatedDate2)

m311 %>%
select(Ticket.ID, Issue.Type, Actual.Completed.Days) %>%
top_n(3, Actual.Completed.Days) %>%
arrange(desc(Actual.Completed.Days))

##################
#################
m311 %>%
mutate(
Year = year(as.Date(CreatedDate, format="%m/%d/%Y"))
) %>%
group_by(Year, Issue.Type) %>%
summarise(
obs = n(),
avgResTime = mean(Actual.Completed.Days, na.rm = TRUE)
) %>%
top_n(3, obs) %>%
arrange(desc(obs))

#############################
#Practice Exersices - Part 2#
#############################
# What are top 3 methods we #
# we will receive gps #
#############################

##################
#################

m311 %>%
summarise(
gpsObs = sum((!is.na(Latitude) & !is.na(Longitude)))
) %>%
top_n(3, gpsObs) %>%
arrange(desc(gpsObs))

##############################
#Practice Exersices - Part 3 #
##############################
# Which Districts service #
# the most requests for each #
# department and what are #
# the top 3 issues #
##############################

##################
##################

m311 %>%
group_by(Case.Owner, District, Issue.Type) %>%
summarise(
issueObs = n()
) %>%
arrange(desc(issueObs)) %>%
summarise(
totalObs = sum(issueObs),
top_issues = toString(Issue.Type[1:3])
) %>%
top_n(3, totalObs) %>%
arrange(desc(totalObs)) %>% glimpse()

x[6,] %>% glimpse()```