library(tidyverse)
library(nycflights13)
library(gridExtra)
# Preview the flights table (flights taking off from / landing in NYC in 2013).
head(flights)

# Flights on January 1st: once as a direct call piped into head(), once as a
# full pipeline. Both forms select the same rows.
filter(flights, month == 1, day == 1) %>%
  head(2)
flights %>%
  filter(month == 1, day == 1) %>%
  head(2)
# Comparison is element-wise: one logical value per element.
c(1, 2, 3) > 2

# `|` is vectorised: operands are paired position by position
# (1 > 2 with 4 <= 4, then 2 > 2 with 5 <= 4, and so on).
c(1, 2, 3) > 2 | c(4, 5, 6) <= 4

# `||` is the scalar, short-circuit form and only ever looks at one value per
# side. Older R versions silently took the first element of a longer operand;
# R >= 4.3.0 raises an error instead, so the first elements are selected
# explicitly to demonstrate the same result on every R version.
(c(1, 2, 3) > 2)[1] || (c(4, 5, 6) <= 4)[1]
# November or December flights, written with an explicit OR ...
flights %>%
  filter(month == 11 | month == 12) %>%
  head(2)
# ... and with the membership operator: month is one of 11 or 12.
flights %>%
  filter(month %in% c(11, 12)) %>%
  head(2)
# Testing for missing values.
is.na(NA)
!is.na("123")

# NA does not always propagate: these results are the same whatever the
# missing value would have been.
NA^0 # -> 1, anything raised to the power 0 is 1
NA | TRUE
FALSE & NA

# ... but multiplying by zero still yields NA (the unknown value could be Inf,
# and Inf * 0 is NaN, so the result is not determined).
NA * 0
# Exercises on filter(). The exercise statements are kept as comments so that
# the script parses; as bare text they are a syntax error in R.

# 1. Find all flights that
# 1.1. Had an arrival delay of two or more hours
filter(flights, arr_delay >= 120) %>% head(2)

# 1.2. Flew to Houston (IAH or HOU)
filter(flights, dest %in% c("IAH", "HOU")) %>% head(2)

# 1.3. Were operated by United, American, or Delta
unique(flights$carrier) # list the carrier codes to pick from
filter(flights, carrier %in% c("UA", "AA", "DL")) %>% head(2)

# 1.4. Departed in summer (July, August, and September)
filter(flights, month %in% 7:9) %>% head(2)
filter(flights, between(month, 7, 9)) %>% head(2)

# 1.5. Arrived more than two hours late, but didn't leave late
filter(flights, dep_delay <= 0, arr_delay > 120) %>% head(2)

# 1.6. Were delayed by at least an hour, but made up over 30 minutes in flight
filter(flights, dep_delay >= 60, arr_delay < dep_delay - 30) %>% head(2)

# 1.7. Departed between midnight and 6am (inclusive)
# dep_time is an HHMM number and a departure at exactly midnight is recorded
# as 2400, so `dep_time <= 600` alone would miss it.
filter(flights, dep_time <= 600 | dep_time == 2400) %>% head(2)

# 3. How many flights have a missing dep_time? What other variables are
#    missing? What might these rows represent?
filter(flights, is.na(dep_time)) %>% head(2)
# Sorting rows: ascending order is the default, desc() reverses it.
flights %>% arrange(arr_delay) %>% head(2)
flights %>% arrange(desc(arr_delay)) %>% head(2)

# Selecting columns by name, by range, and by excluding a range.
flights %>% select(year, month, day) %>% head(2)
flights %>% select(year:day) %>% head(2)
flights %>% select(-(year:day)) %>% head(2)

# Selecting columns with helper functions.
flights %>% select(starts_with("dep")) %>% head(2)
flights %>% select(ends_with("delay")) %>% head(2)
flights %>% select(ends_with("delay"), starts_with("dep")) %>% head(2)
flights %>% select(contains("d")) %>% head(3)
# Regular expression: a "d" immediately followed by one of a, e, i, o, u.
flights %>% select(matches("d[aeiou]")) %>% head(2)

# rename() keeps every column and only changes the given name.
flights %>% rename(tail_num = tailnum) %>% head(2)

# everything() appends the remaining columns after the ones listed first.
flights %>% select(time_hour, air_time, everything()) %>% head(2)
# Narrow table holding only the columns needed for the derived variables.
flights_sml <- flights %>%
  select(year:day, ends_with("delay"), distance, air_time)

# Add two derived columns:
#   gain  - arrival delay minus departure delay
#   speed - distance / air_time * 60
flights_sml %>%
  mutate(
    gain = arr_delay - dep_delay,
    speed = distance / air_time * 60
  ) %>%
  head(2)
# The same filter-then-sort written as nested calls and as a pipeline.
arrange(filter(flights, month == 1), day) %>% head(2)
flights %>%
  filter(month == 1) %>%
  arrange(day) %>%
  head(2)

# Overall mean departure delay. `TRUE` is spelled out because `T` is an
# ordinary variable that can be reassigned, whereas `TRUE` is a reserved word.
summarise(flights, delay = mean(dep_delay, na.rm = TRUE))

# Mean departure delay per calendar day.
by_day <- flights %>%
  group_by(year, month, day) %>%
  summarise(delay = mean(dep_delay, na.rm = TRUE))
head(by_day)

# Same summary with the number of flights in each group added via n().
flights %>%
  group_by(year, month, day) %>%
  summarise(delay = mean(dep_delay, na.rm = TRUE), cnt = n()) %>%
  head(2)
# Plot height option for the notebook renderer.
options(repr.plot.height = 3)

# Distribution of carat: default bins, then a much finer bin width.
ggplot(diamonds) +
  geom_histogram(aes(x = carat))
ggplot(diamonds) +
  geom_histogram(aes(x = carat), binwidth = 0.01)

# Highway mileage per vehicle class.
ggplot(mpg, aes(x = class, y = hwy)) +
  geom_boxplot()
# Same plot with the classes ordered by their median hwy.
ggplot(mpg) +
  geom_boxplot(aes(x = reorder(class, hwy, FUN = median), y = hwy))

# Count of diamonds for every colour/cut combination, drawn as a heat map.
diamonds %>%
  count(color, cut) %>%
  ggplot(aes(x = color, y = cut)) +
  geom_tile(aes(fill = n))

# Price against carat: plain scatter, then with strong transparency so that
# dense regions stand out.
ggplot(diamonds) +
  geom_point(aes(x = carat, y = price))
ggplot(diamonds) +
  geom_point(aes(x = carat, y = price), alpha = 1 / 100)
library(hexbin)
# install.packages("hexbin")

# Two-dimensional binning of price against carat: rectangular bins (gg1)
# and hexagonal bins (gg2), shown side by side.
gg1 <- ggplot(diamonds) +
  geom_bin2d(aes(x = carat, y = price))
gg2 <- ggplot(diamonds) +
  geom_hex(aes(x = carat, y = price))
grid.arrange(gg1, gg2, ncol = 2)

# Treat carat as categorical by cutting it into 0.1-wide intervals and
# drawing one box plot per interval.
ggplot(diamonds, aes(x = carat, y = price)) +
  geom_boxplot(aes(group = cut_width(carat, 0.1)))
options(repr.plot.height = 4)

# Label every observation with its row name. The outlier markers would be
# hard to read underneath the labels, so outlier.alpha = 0 makes them
# invisible.
ggplot(mpg, aes(class, hwy)) +
  geom_boxplot(outlier.alpha = 0) +
  geom_text(aes(label = rownames(mpg)))

# Plain box plot, used as the starting point for the outlier work below.
ggplot(mpg, aes(class, hwy)) +
  geom_boxplot()
# Outlier bounds for hwy over the whole mpg table: anything more than
# 1.5 * IQR below the first quartile or above the third quartile.
q1 <- quantile(mpg$hwy, probs = 0.25)
q3 <- quantile(mpg$hwy, probs = 0.75)
iqr <- q3 - q1
upper <- q3 + 1.5 * iqr
lower <- q1 - 1.5 * iqr

# Rows whose hwy falls outside those bounds.
mpg %>% filter(hwy < lower | hwy > upper)
# Keep only the rows of `df` whose `hwy` value is an outlier, i.e. lies more
# than 1.5 * IQR below the first quartile or above the third quartile of
# `df$hwy`.
#
# df: a data frame with a numeric `hwy` column.
# Returns the filtered data frame.
filter.Outlier <- function(df) {
  quartiles <- quantile(df$hwy, probs = c(0.25, 0.75), names = FALSE)
  spread <- quartiles[2] - quartiles[1]
  fence_low <- quartiles[1] - 1.5 * spread
  fence_high <- quartiles[2] + 1.5 * spread
  # No explicit return(): an R function returns its last evaluated expression.
  filter(df, hwy > fence_high | hwy < fence_low)
}
# Store each row's position in `name` so that rows can still be identified
# (and labelled) after filtering.
mpg <- mutate(mpg, name = row_number())

# Outliers relative to the whole table.
filter.Outlier(mpg)

# do() applies an expression to each group: group_by() splits the table by
# class, each per-group data frame is substituted for `.`, and the results
# are combined again.
mpg %>%
  group_by(class) %>%
  do(head(., 1))
mpg %>%
  group_by(class) %>%
  do(filter.Outlier((.)))

# Per-class outliers, kept for labelling.
x <- mpg %>%
  group_by(class) %>%
  do(filter.Outlier(.))

# Box plot with the built-in outlier markers hidden and the per-class
# outliers drawn as text labels instead.
ggplot(mpg, aes(class, hwy)) +
  geom_boxplot(outlier.alpha = 0) +
  geom_text(data = x, aes(label = name))
# Return the three rows of `df` with the largest `hwy` values (fewer if `df`
# has fewer than three rows), ordered from highest to lowest.
filter.hwy <- function(df) {
  by_hwy_desc <- arrange(df, desc(hwy))
  head(by_hwy_desc, 3)
}
# Top three rows by hwy for every model, using the helper function.
mpg %>%
  group_by(model) %>%
  do(filter.hwy(.))

# The same pipeline written inline. Inside do() the placeholder has to be
# wrapped in parentheses, `(.)`, when it starts a pipeline: a pipeline that
# begins with a bare `.` is treated by magrittr as a function definition
# rather than being evaluated on the group.
mpg %>%
  group_by(model) %>%
  do((.) %>% arrange(desc(hwy)) %>% head(3))
# Give every flight an id equal to its row position.
flights <- mutate(flights, name = row_number())

aa <- flights %>% filter(carrier == "AA")
july <- flights %>% filter(month == 7)

# Ids of AA flights that are not July flights.
i <- setdiff(aa$name, july$name)

# `name` was assigned as the row number above, so the ids can be used
# directly as row indices.
head(flights[i, ])