Data Munging¶

Types of Data Munging¶

# Install dplyr only when it is missing, so re-running the notebook does not
# download the package again every time; the package is fetched over HTTPS.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr", repos = "https://cran.us.r-project.org")
}

Installing package into 'C:/Users/byung/Documents/R/win-library/3.3'
(as 'lib' is unspecified)

package 'dplyr' successfully unpacked and MD5 sums checked

The downloaded binary packages are in
	C:\Users\byung\AppData\Local\Temp\RtmpO4H1HS\downloaded_packages

Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

library(dplyr)

Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

# Build the sample score table directly from its columns. The vectors are
# passed inline so that no global named `c` is created (it would mask
# base::c() for variable lookups) and no scratch variables are left behind.
exam1 <- data.frame(
  ID = 1:5,
  Exam1 = c(3.3, 4, 2.3, 2.2, 3.1),
  Exam2 = c(2, 4, 0, 1, 1.2),
  Quiz = c(3.7, 4, 3.3, 3.3, 3.9)
)
exam1

# Save without row names: by default write.csv() also writes the row names,
# and read.csv() then returns them as an extra `X` column that merely
# repeats the row numbers.
write.csv(exam1, "exam1.csv", row.names = FALSE)

Selects a subset of records based on a specified condition¶

# Keep students with Exam2 of at least 1 and Quiz below 3.9, written as a
# single condition joined with `&`.
exam1 %>% filter(Exam2 >= 1 & Quiz < 3.9)

# Same selection with the conditions given as separate arguments; filter()
# combines them with a logical AND.
exam1 %>% filter(Exam2 >= 1, Quiz < 3.9)

Exam1과 Exam2 둘 다 평균 이상인 학생은?

# Column averages, shown for reference.
mean(exam1[["Exam1"]])
mean(exam1[["Exam2"]])

# Students scoring at or above the average on both exams.
exam1 %>% filter(Exam1 >= mean(Exam1), Exam2 >= mean(Exam2))

Appends records from multiple inputs¶

# Values for a new row, in column order: ID, Exam1, Exam2, Quiz.
app <- c(6,3.5,1.5,3.5)
# Show exam1 with the new row appended; the result is not assigned, so exam1
# itself is left unchanged.
rbind(exam1,app)

변수 app의 값이 c(6,1)이라면?

# Only two values are supplied for a four-column table.
app <- c(6,1)
# rbind() recycles the short vector across the columns, so the appended row
# becomes 6, 1, 6, 1 instead of raising an error.
rbind(exam1,app)

Reorders records according to the specified order criteria¶

# Sort by Quiz in ascending order; use arrange(exam1, desc(Quiz)) for
# descending order.
exam1 %>% arrange(Quiz)

# Sort in ascending order by Quiz first, then by Exam1.
exam1 %>% arrange(Quiz, Exam1)

Selects a random sample¶

# Draw 3 rows at random (fixed sample size).
exam1 %>% sample_n(3)

# Draw 40% of the rows at random (fixed sampling fraction).
exam1 %>% sample_frac(0.4)

# Systematic (1-in-2) sampling: keep rows 1, 3, 5, ... seq_len() is used
# instead of 1:nrow() so that an empty table gives an empty selection rather
# than being indexed with c(1, 0).
exam1[seq_len(nrow(exam1)) %% 2 == 1, ]

Summarizes information on groups of records¶

# Add each student's gender ("남" = male, "여" = female).
exam1$Gender <- c("남", "여", "남", "여", "남")

# Save the extended table.
write.csv(exam1, "exam3.csv")

# Per-gender summary: mean of Exam1 and median of Quiz.
by_gender <- group_by(exam1, Gender)
summarise(by_gender, exam1 = mean(Exam1), quiz = median(Quiz))

# Per-gender minimum and maximum of each score column.
exam1 %>%
  group_by(Gender) %>%
  summarise_each(funs(min, max), Exam1, Exam2, Quiz)

Includes records with distinct values in specified fields¶

# Keep the first row seen for each Quiz value.
exam1 %>% filter(!duplicated(Quiz))

# List the distinct Quiz values.
exam1 %>% distinct(Quiz)

Allows new fields to be generated based on existing fields¶

# Derive the total and the average of the two exams; ExamMean is computed
# from the ExamSum column created in the same call.
exam1 <- exam1 %>%
  mutate(
    ExamSum = Exam1 + Exam2,
    ExamMean = ExamSum / 2
  )
exam1

Allows fields to be renamed or removed¶

# Keep only the columns from ID through Exam2.
select(exam1, ID:Exam2)

# Drop the derived columns. Only columns that are still present are removed,
# so running this cell a second time no longer fails with
# "object 'ExamSum' not found".
drop_cols <- intersect(c("ExamSum", "ExamMean"), names(exam1))
if (length(drop_cols) > 0) {
  exam1 <- select(exam1, -one_of(drop_cols))
}
exam1

Error in eval(expr, envir, enclos): 객체 'ExamSum'를 찾을 수 없습니다
Traceback:

1. select(exam1, -ExamSum, -ExamMean)
2. select_(.data, .dots = lazyeval::lazy_dots(...))
3. select_.data.frame(.data, .dots = lazyeval::lazy_dots(...))
4. select_vars_(names(.data), dots)
5. lazyeval::lazy_eval(args, names_list)
6. lapply(x, lazy_eval, data = data)
7. FUN(X[[i]], ...)
8. eval(x$expr, data, x$env)
9. eval(expr, envir, enclos)

# Show exam1 with shorter lower-case column names (the result is not assigned).
exam1 %>% rename(id = ID, quiz = Quiz, ex1 = Exam1, ex2 = Exam2)

Changes the sort order of fields¶

# Show the selected columns in a new order: ID, Quiz, then Exam1 through Exam2.
exam1 %>% select(ID, Quiz, Exam1:Exam2)

Allows values in existing fields to be replaced by new values¶

# Add an Extra column in which two students have missing values.
exam1[["Extra"]] <- c(1, 1, NA, NA, 2)
exam1

# Replace the missing values with 0.
exam1[["Extra"]] <- replace(exam1[["Extra"]], is.na(exam1[["Extra"]]), 0)
exam1

Merges records from multiple inputs¶

# Second score table, keyed by CID. The columns are passed inline so that no
# global named `c` is created (it would mask base::c() for variable lookups)
# and no scratch variables are left behind.
exam2 <- data.frame(
  CID = 1:5,
  Exam3 = c(3.1, 4, 2.3, 5.2, 2.1),
  Exam4 = c(2, 4, 0, 1, 1.2),
  FinalExam = c(1.5, 2, 3.6, 6.3, 2.9)
)

# Join the two score tables on the student key, which is named ID in exam1
# and CID in exam2.
merge(x = exam1, y = exam2, by.x = "ID", by.y = "CID")

Transposes records to fields and fields to records¶

# Transpose so that columns become rows. t() converts the data frame to a
# matrix first, so a table containing a character column yields a character
# matrix.
t(exam1)

Creates new fields from one or more categorical fields¶

Averaging values

# Sum of Quiz within each Gender group. tapply() needs a categorical grouping
# variable; apply() is the counterpart for matrices.
# NOTE(review): the section title says "Averaging values" but this computes a
# sum — confirm whether mean was intended.
tapply(exam1$Quiz,exam1$Gender,sum)

Creates new fields from one or more categorical fields¶

Melting & Casting

# Small page-visit log used for the melt/cast examples. The columns are
# passed inline so that no global named `c` is created (it would mask
# base::c() for variable lookups) and no scratch variables are left behind.
tr <- data.frame(
  id = c(1, 1, 1, 1, 2, 2, 2),
  site = c("a", "b", "c", "a", "a", "b", "b"),
  pageview = 1:7,
  dwelltime = 7:1
)
tr

library(reshape)

# Reshape to long format: id and site identify each record, while pageview
# and dwelltime are stacked into variable/value pairs.
tr.melt <- melt(
  tr,
  id.vars = c("id", "site"),
  measure.vars = c("pageview", "dwelltime")
)
tr.melt

formula = var1 ~ var2 : var1의 level을 행 방향으로, var2의 level을 열 방향으로 배치하고 value의 값을 지정한 함수(function)로 집계한다.

# Total pageview for each id (rows) and site (columns).
cast(
  tr.melt,
  id ~ site,
  fun.aggregate = sum,
  subset = variable == "pageview"
)

# Number of records for each id/site pair, one column per measured variable.
cast(tr.melt, id + site ~ variable, fun.aggregate = length)

# Mean pageview for each id.
cast(
  tr.melt,
  id ~ variable,
  fun.aggregate = mean,
  subset = variable == "pageview"
)

Converts numeric fields into discrete pieces¶

# Reload the saved scores and add the combined exam score.
exam1 <- read.csv("exam1.csv")

exam1 <- mutate(exam1, ExamSum = Exam1 + Exam2)

# Bin ExamSum into 3 equal-width intervals; labels = FALSE returns the integer
# bin number instead of a factor. FALSE is spelled out because `F` is an
# ordinary variable that can be reassigned.
exam1$Level <- cut(exam1$ExamSum, breaks = 3, labels = FALSE)
exam1

# Bin ExamSum with explicit break points: (0,2], (2,4], (4,6], (6,8].
exam1$Level <- cut(exam1$ExamSum, breaks = c(0, 2, 4, 6, 8), labels = FALSE)
exam1

백화점 데이터를 통해 아래 문제를 해결¶

1) 50대 기혼 여성 고객리스트를 아래와 같이 출력하시오.¶

 custid gender age marriage    residence        job
 36  46111     여  57     기혼   Gangnam-gu   개인사업
 37  46230     여  57     기혼  Jungnang-gu   개인사업
 38  46327     여  58     기혼    Seocho-gu   금융기관
 39  46441     여  58     기혼   Gangnam-gu   교육기관
 40  46554     여  58     기혼 Seodaemun-gu 정보서비스
 41  46869     여  59     기혼    Seocho-gu     건설업

# Load the customer and transaction tables (tab-separated, with a header row,
# strings kept as character). TRUE/FALSE are spelled out because `T` and `F`
# are ordinary variables that can be reassigned.
cs <- read.table("dataCustomers.tab", sep = "\t", header = TRUE,
                 stringsAsFactors = FALSE)
tr <- read.table("dataTransactions.tab", sep = "\t", header = TRUE,
                 stringsAsFactors = FALSE)
head(cs, 3)
head(tr, 3)

# Married female customers aged 50 to 59; show the last six matches.
cs %>%
  filter(age >= 50, age <= 59, gender == "여", marriage == "기혼") %>%
  tail()

2) H백화점의 남녀별 평균나이를 계산¶

# Number of customers per gender.
cs %>%
  group_by(gender) %>%
  summarize(Cnt = n())

# Average age per gender.
# Equivalent form: group_by(cs, gender) %>% summarize(age = mean(age))
cs %>%
  group_by(gender) %>%
  summarize(age = mean(age))

3) 거주지역 전체 출력.¶

# First three distinct residence values, via dplyr.
cs %>%
  distinct(residence) %>%
  head(3)

# The same lookup with base R, returned as a plain vector.
head(unique(cs[["residence"]]), 3)

4) 지점별 수입품과 국산품 건수¶

head(tr, 3)

# Number of transactions for each store, split by the import flag.
tr %>%
  group_by(store, import) %>%
  summarise(Cnt = n())

5) 남녀별로 건당 구매액의 최소값, 중앙값, 최대값을 계산하여 아래와 같이 출력하시오.¶

# Attach customer attributes to every transaction. The join key is named
# explicitly so the merge cannot silently switch to a different key if the
# two tables ever gain another column name in common.
tmp <- merge(cs, tr, by = "custid")
head(tmp, 3)

# Minimum, median and maximum transaction amount for each gender.
tmp %>%
  group_by(gender) %>%
  summarise_each(funs(min, median, max), amount)

6) 총 구매액이 가장 많은 사람부터 적은 사람순으로 정렬¶

# Total purchase amount per customer, largest first; show the top three.
tr %>%
  group_by(custid) %>%
  summarise(amount = sum(amount)) %>%
  arrange(desc(amount)) %>%
  head(3)

	X	ID	Exam1	Exam2	Quiz	ExamSum	Level
1	1.0	1.0	3.3	2.0	3.7	5.3	2.0
2	2	2	4	4	4	8	3
3	3.0	3.0	2.3	0.0	3.3	2.3	1.0
4	4.0	4.0	2.2	1.0	3.3	3.2	1.0
5	5.0	5.0	3.1	1.2	3.9	4.3	2.0

	X	ID	Exam1	Exam2	Quiz	ExamSum	Level
1	1.0	1.0	3.3	2.0	3.7	5.3	3.0
2	2	2	4	4	4	8	4
3	3.0	3.0	2.3	0.0	3.3	2.3	2.0
4	4.0	4.0	2.2	1.0	3.3	3.2	2.0
5	5.0	5.0	3.1	1.2	3.9	4.3	3.0

	custid	gender	age	marriage	residence	job
1	10070	여	28	미혼	Yongsan-gu	제조업
2	10139	여	28	미혼	Gangdong-gu	정보서비스
3	10208	여	28	미혼	Gwangjin-gu	제조업

	datetime	custid	store	product	brand	corner	import	amount	installment
1	2000-05-01 10:43	18313	신촌점	4104840008000	샤넬	화장품	1	113000	3
2	2000-05-01 11:00	18313	신촌점	2.7e+12	식품	일반식품	0	91950	3
3	2000-05-01 11:33	27222	신촌점	4545370944500	까사미아	가구	0	598000	3

	residence
1	Yongsan-gu
2	Gangdong-gu
3	Gwangjin-gu

	ID	Exam1	Exam2	Quiz
1	1.0	3.3	2.0	3.7
2	2	4	4	4
3	3.0	2.3	0.0	3.3
4	4.0	2.2	1.0	3.3
5	5.0	3.1	1.2	3.9
6	6.0	3.5	1.5	3.5

	ID	Exam1	Exam2	Quiz
1	1.0	3.3	2.0	3.7
2	2	4	4	4
3	3.0	2.3	0.0	3.3
4	4.0	2.2	1.0	3.3
5	5.0	3.1	1.2	3.9
6	6	1	6	1

	ID	Exam1	Exam2	Quiz	Gender	ExamSum	ExamMean
1	1	3.3	2	3.7	남	5.3	2.65
2	2	4	4	4	여	8	4
3	3	2.3	0	3.3	남	2.3	1.15
4	4	2.2	1	3.3	여	3.2	1.6
5	5	3.1	1.2	3.9	남	4.3	2.15

	ID	Exam1	Exam2	Quiz	Gender	Extra
1	1	3.3	2	3.7	남	1
2	2	4	4	4	여	1
3	3	2.3	0	3.3	남	NA
4	4	2.2	1	3.3	여	NA
5	5	3.1	1.2	3.9	남	2

	ID	Exam1	Exam2	Quiz	Gender	Extra
1	1	3.3	2	3.7	남	1
2	2	4	4	4	여	1
3	3	2.3	0	3.3	남	0
4	4	2.2	1	3.3	여	0
5	5	3.1	1.2	3.9	남	2

	ID	Exam1	Exam2	Quiz	Gender	Extra	Exam3	Exam4	FinalExam
1	1	3.3	2	3.7	남	1	3.1	2	1.5
2	2	4	4	4	여	1	4	4	2
3	3	2.3	0	3.3	남	0	2.3	0	3.6
4	4	2.2	1	3.3	여	0	5.2	1	6.3
5	5	3.1	1.2	3.9	남	2	2.1	1.2	2.9

	id	site	variable	value
1	1	a	pageview	1
2	1	b	pageview	2
3	1	c	pageview	3
4	1	a	pageview	4
5	2	a	pageview	5
6	2	b	pageview	6
7	2	b	pageview	7
8	1	a	dwelltime	7
9	1	b	dwelltime	6
10	1	c	dwelltime	5
11	1	a	dwelltime	4
12	2	a	dwelltime	3
13	2	b	dwelltime	2
14	2	b	dwelltime	1

	X	ID	Exam1	Exam2	Quiz	ExamSum	Level
1	1.0	1.0	3.3	2.0	3.7	5.3	2.0
2	2	2	4	4	4	8	3
3	3.0	3.0	2.3	0.0	3.3	2.3	1.0
4	4.0	4.0	2.2	1.0	3.3	3.2	1.0
5	5.0	5.0	3.1	1.2	3.9	4.3	2.0

	X	ID	Exam1	Exam2	Quiz	ExamSum	Level
1	1.0	1.0	3.3	2.0	3.7	5.3	3.0
2	2	2	4	4	4	8	4
3	3.0	3.0	2.3	0.0	3.3	2.3	2.0
4	4.0	4.0	2.2	1.0	3.3	3.2	2.0
5	5.0	5.0	3.1	1.2	3.9	4.3	3.0

	custid	gender	age	marriage	residence	job
36	46111	여	57	기혼	Gangnam-gu	개인사업
37	46230	여	57	기혼	Jungnang-gu	개인사업
38	46327	여	58	기혼	Seocho-gu	금융기관
39	46441	여	58	기혼	Gangnam-gu	교육기관
40	46554	여	58	기혼	Seodaemun-gu	정보서비스
41	46869	여	59	기혼	Seocho-gu	건설업

	gender	Cnt
1	남	154
2	여	346

	store	import	Cnt
1	무역점	0	4363
2	무역점	1	592
3	본점	0	3090
4	본점	1	632
5	신촌점	0	5622
6	신촌점	1	488
7	천호점	0	3988
8	천호점	1	350

	gender	min	median	max
1	남	650	51020	8000000
2	여	840	54354	3930000

	ID	Exam1	Exam2	Quiz
1	1.0	3.3	2.0	3.7
2	2	4	4	4
3	3.0	2.3	0.0	3.3
4	4.0	2.2	1.0	3.3
5	5.0	3.1	1.2	3.9
6	6.0	3.5	1.5	3.5

	ID	Exam1	Exam2	Quiz
1	1.0	3.3	2.0	3.7
2	2	4	4	4
3	3.0	2.3	0.0	3.3
4	4.0	2.2	1.0	3.3
5	5.0	3.1	1.2	3.9
6	6	1	6	1

	ID	Exam1	Exam2	Quiz	Gender	Extra
1	1	3.3	2	3.7	남	1
2	2	4	4	4	여	1
3	3	2.3	0	3.3	남	NA
4	4	2.2	1	3.3	여	NA
5	5	3.1	1.2	3.9	남	2

	ID	Exam1	Exam2	Quiz	Gender	Extra
1	1	3.3	2	3.7	남	1
2	2	4	4	4	여	1
3	3	2.3	0	3.3	남	0
4	4	2.2	1	3.3	여	0
5	5	3.1	1.2	3.9	남	2

	ID	Exam1	Exam2	Quiz	Gender	Extra	Exam3	Exam4	FinalExam
1	1	3.3	2	3.7	남	1	3.1	2	1.5
2	2	4	4	4	여	1	4	4	2
3	3	2.3	0	3.3	남	0	2.3	0	3.6
4	4	2.2	1	3.3	여	0	5.2	1	6.3
5	5	3.1	1.2	3.9	남	2	2.1	1.2	2.9

	X	ID	Exam1	Exam2	Quiz	ExamSum	Level
1	1.0	1.0	3.3	2.0	3.7	5.3	2.0
2	2	2	4	4	4	8	3
3	3.0	3.0	2.3	0.0	3.3	2.3	1.0
4	4.0	4.0	2.2	1.0	3.3	3.2	1.0
5	5.0	5.0	3.1	1.2	3.9	4.3	2.0

	X	ID	Exam1	Exam2	Quiz	ExamSum	Level
1	1.0	1.0	3.3	2.0	3.7	5.3	3.0
2	2	2	4	4	4	8	4
3	3.0	3.0	2.3	0.0	3.3	2.3	2.0
4	4.0	4.0	2.2	1.0	3.3	3.2	2.0
5	5.0	5.0	3.1	1.2	3.9	4.3	3.0

	ID	Exam1	Exam2	Quiz
1	1.0	3.3	2.0	3.7
2	2	4	4	4
3	3.0	2.3	0.0	3.3
4	4.0	2.2	1.0	3.3
5	5.0	3.1	1.2	3.9
6	6.0	3.5	1.5	3.5

	ID	Exam1	Exam2	Quiz
1	1.0	3.3	2.0	3.7
2	2	4	4	4
3	3.0	2.3	0.0	3.3
4	4.0	2.2	1.0	3.3
5	5.0	3.1	1.2	3.9
6	6	1	6	1

	ID	Exam1	Exam2	Quiz	Gender	Extra
1	1	3.3	2	3.7	남	1
2	2	4	4	4	여	1
3	3	2.3	0	3.3	남	NA
4	4	2.2	1	3.3	여	NA
5	5	3.1	1.2	3.9	남	2

	ID	Exam1	Exam2	Quiz	Gender	Extra
1	1	3.3	2	3.7	남	1
2	2	4	4	4	여	1
3	3	2.3	0	3.3	남	0
4	4	2.2	1	3.3	여	0
5	5	3.1	1.2	3.9	남	2

	ID	Exam1	Exam2	Quiz	Gender	Extra	Exam3	Exam4	FinalExam
1	1	3.3	2	3.7	남	1	3.1	2	1.5
2	2	4	4	4	여	1	4	4	2
3	3	2.3	0	3.3	남	0	2.3	0	3.6
4	4	2.2	1	3.3	여	0	5.2	1	6.3
5	5	3.1	1.2	3.9	남	2	2.1	1.2	2.9

	X	ID	Exam1	Exam2	Quiz	ExamSum	Level
1	1.0	1.0	3.3	2.0	3.7	5.3	2.0
2	2	2	4	4	4	8	3
3	3.0	3.0	2.3	0.0	3.3	2.3	1.0
4	4.0	4.0	2.2	1.0	3.3	3.2	1.0
5	5.0	5.0	3.1	1.2	3.9	4.3	2.0

	X	ID	Exam1	Exam2	Quiz	ExamSum	Level
1	1.0	1.0	3.3	2.0	3.7	5.3	3.0
2	2	2	4	4	4	8	4
3	3.0	3.0	2.3	0.0	3.3	2.3	2.0
4	4.0	4.0	2.2	1.0	3.3	3.2	2.0
5	5.0	5.0	3.1	1.2	3.9	4.3	3.0