Data Munging

Type of Data Munging

In [1]:
install.packages("dplyr", repos = "http://cran.us.r-project.org")
Installing package into 'C:/Users/byung/Documents/R/win-library/3.3'
(as 'lib' is unspecified)
package 'dplyr' successfully unpacked and MD5 sums checked

The downloaded binary packages are in
	C:\Users\byung\AppData\Local\Temp\RtmpO4H1HS\downloaded_packages
Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

In [1]:
library(dplyr)
Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

In [11]:
a <- 1:5
b <- c(3.3,4,2.3,2.2,3.1)
c <- c(2,4,0,1,1.2)
d <- c(3.7,4,3.3,3.3,3.9)

exam1 <- data.frame(ID=a,Exam1=b,Exam2=c,Quiz=d)
exam1

write.csv(exam1,"exam1.csv")
IDExam1Exam2Quiz
11.03.32.03.7
22444
33.02.30.03.3
44.02.21.03.3
55.03.11.23.9

Selects a subset of records based on a specified condition

In [5]:
filter(exam1, Exam2 >= 1 & Quiz < 3.9)
IDExam1Exam2Quiz
11.03.32.03.7
24.02.21.03.3
In [6]:
filter(exam1, Exam2 >=1, Quiz < 3.9)
IDExam1Exam2Quiz
11.03.32.03.7
24.02.21.03.3
  • Exam1과 Exam 둘 다 평균 이상인 학생은?
In [9]:
mean(exam1$Exam1)
mean(exam1$Exam2)
filter(exam1, Exam1 >= mean(Exam1), Exam2 >= mean(Exam2))
2.98
1.64
IDExam1Exam2Quiz
11.03.32.03.7
22444

Appends records from multiple inputs

In [13]:
app <- c(6,3.5,1.5,3.5)
rbind(exam1,app)
IDExam1Exam2Quiz
11.03.32.03.7
22444
33.02.30.03.3
44.02.21.03.3
55.03.11.23.9
66.03.51.53.5
  • 변수 app의 값이 c(6,1)이라면?
In [14]:
app <- c(6,1)
rbind(exam1,app)
IDExam1Exam2Quiz
11.03.32.03.7
22444
33.02.30.03.3
44.02.21.03.3
55.03.11.23.9
66161

Reorders records according to the specified order criteria

In [16]:
arrange(exam1, Quiz) # arrange(exam1, desc(Quiz)) 내림차순
IDExam1Exam2Quiz
13.02.30.03.3
24.02.21.03.3
31.03.32.03.7
45.03.11.23.9
52444
In [19]:
# Quiz와 Exam1 순서로 오름차순으로 정렬한다면?
arrange(exam1, Quiz,Exam1)
IDExam1Exam2Quiz
14.02.21.03.3
23.02.30.03.3
31.03.32.03.7
45.03.11.23.9
52444

Selects a random sample

In [22]:
sample_n(exam1, 3) # Random sampling with a fixed number
IDExam1Exam2Quiz
55.03.11.23.9
33.02.30.03.3
22444
In [26]:
sample_frac(exam1, 0.4) # Random sampling with a fixed fraction
IDExam1Exam2Quiz
44.02.21.03.3
22444
In [28]:
exam1[as.logical((1:nrow(exam1))%%2),] # 1-in-n sampling
IDExam1Exam2Quiz
11.03.32.03.7
33.02.30.03.3
55.03.11.23.9

Summarizes information on groups of records

In [4]:
exam1$Gender <- c("남","여","남","여","남")
In [5]:
write.csv(exam1,"exam3.csv")
In [6]:
by_gender = group_by(exam1, Gender)
summarise(by_gender, exam1=mean(Exam1), quiz=median(Quiz))
Genderexam1quiz
12.93.7
23.1 3.65
In [7]:
exam1 %>% group_by(Gender) %>% summarise_each(funs(min,max), Exam1, Exam2, Quiz)
GenderExam1_minExam2_minQuiz_minExam1_maxExam2_maxQuiz_max
12.30 3.33.32 3.9
22.21 3.34 4 4

Includes records with distinct values in specified fields

In [40]:
filter(exam1,!duplicated(Quiz))
IDExam1Exam2QuizGender
11 3.32 3.7
22 4 4 4
33 2.30 3.3
45 3.11.23.9
In [42]:
distinct(exam1, Quiz) # return unique values
Quiz
13.7
24
33.3
43.9

Allows new fields to be generated based on existing fields

In [44]:
exam1 <- mutate(exam1, ExamSum=Exam1+Exam2, ExamMean=ExamSum/2)
exam1
IDExam1Exam2QuizGenderExamSumExamMean
11 3.3 2 3.7 5.3 2.65
22 4 4 4 8 4
33 2.3 0 3.3 2.3 1.15
44 2.21 3.33.21.6
55 3.1 1.2 3.9 4.3 2.15

Allows fields to be renamed or removed

In [46]:
select(exam1, ID:Exam2)
IDExam1Exam2
11.03.32.0
2244
33.02.30.0
44.02.21.0
55.03.11.2
In [53]:
exam1 <- select(exam1, -ExamSum, -ExamMean)
exam1
Error in eval(expr, envir, enclos): 객체 'ExamSum'를 찾을 수 없습니다
Traceback:

1. select(exam1, -ExamSum, -ExamMean)
2. select_(.data, .dots = lazyeval::lazy_dots(...))
3. select_.data.frame(.data, .dots = lazyeval::lazy_dots(...))
4. select_vars_(names(.data), dots)
5. lazyeval::lazy_eval(args, names_list)
6. lapply(x, lazy_eval, data = data)
7. FUN(X[[i]], ...)
8. eval(x$expr, data, x$env)
9. eval(expr, envir, enclos)
In [55]:
rename(exam1, id=ID, quiz = Quiz, ex1=Exam1, ex2=Exam2)
idex1ex2quizGender
11 3.32 3.7
22 4 4 4
33 2.30 3.3
44 2.21 3.3
55 3.11.23.9

Changes the sort order of fields

In [56]:
select(exam1, ID, Quiz, Exam1:Exam2)
IDQuizExam1Exam2
11.03.73.32.0
22444
33.03.32.30.0
44.03.32.21.0
55.03.93.11.2

Allows values in existing fields to be replaced by new values

In [59]:
exam1$Extra <- c(1, 1, NA, NA, 2)
exam1
IDExam1Exam2QuizGenderExtra
11 3.32 3.71
22 4 4 4 1
33 2.30 3.3NA
44 2.21 3.3NA
55 3.11.23.92
In [62]:
exam1$Extra[is.na(exam1$Extra)] <- 0
exam1
IDExam1Exam2QuizGenderExtra
11 3.32 3.71
22 4 4 4 1
33 2.30 3.30
44 2.21 3.30
55 3.11.23.92

Merges records from multiple inputs

In [63]:
a <- 1:5
b <- c(3.1,4,2.3,5.2,2.1)
c <- c(2,4,0,1,1.2)
d <- c(1.5,2,3.6,6.3,2.9)

exam2 <- data.frame(CID=a,Exam3=b,Exam4=c,FinalExam=d)
In [64]:
merge(exam1, exam2, by.x="ID", by.y="CID")
IDExam1Exam2QuizGenderExtraExam3Exam4FinalExam
11 3.32 3.71 3.12 1.5
22 4 4 4 1 4 4 2
33 2.30 3.30 2.30 3.6
44 2.21 3.30 5.21 6.3
55 3.11.23.92 2.11.22.9

Transposes records to fields and fields to records

In [66]:
t(exam1)
ID12345
Exam13.34.02.32.23.1
Exam22.04.00.01.01.2
Quiz3.74.03.33.33.9
Gender
Extra11002

Creates new fields from one or more categorical fields

  • Averaging values
In [68]:
tapply(exam1$Quiz,exam1$Gender,sum) # 범주형 변수에만 적용이 가능하다. apply는 Matrix에 적용.
10.9
7.3

Creates new fields from one or more categorical fields

  • Melting & Casting
In [74]:
a <- c(1,1,1,1,2,2,2)
b <- c('a','b','c','a','a','b','b')
c <- 1:7
d <- 7:1

tr <- data.frame(id=a,site=b,pageview=c,dwelltime=d)
tr
idsitepageviewdwelltime
11a17
21b26
31c35
41a44
52a53
62b62
72b71
In [76]:
library(reshape)
In [79]:
tr.melt <- melt(tr, id.vars=c("id","site"),measure.vars=c("pageview","dwelltime")) # id.vars의 값들을 기준으로 measure.vars 값들을 펼침.
tr.melt
idsitevariablevalue
11 a pageview1
21 b pageview2
31 c pageview3
41 a pageview4
52 a pageview5
62 b pageview6
72 b pageview7
81 a dwelltime7
91 b dwelltime6
101 c dwelltime5
111 a dwelltime4
122 a dwelltime3
132 b dwelltime2
142 b dwelltime1
  • formular=var1~var2 : var1의 level을 행으로 var2의 level을 열 방향으로 설정해 value의 값을 function으로 집계
In [80]:
cast(tr.melt, id ~ site, sum, subset=variable=="pageview")
idabc
11523
2 2 513 0
In [82]:
cast(tr.melt, id+site~variable, length)
idsitepageviewdwelltime
11a22
21b11
31c11
42a11
52b22
In [84]:
cast(tr.melt, id ~ variable, mean, subset=variable=="pageview")
idpageview
11.02.5
226

Converts numeric fields into discrete pieces

In [12]:
exam1 <- read.csv("exam1.csv")
In [13]:
exam1 <-mutate(exam1, ExamSum=Exam1+Exam2)
In [14]:
exam1$Level <-cut(exam1$ExamSum,breaks=3,labels=F)
exam1
XIDExam1Exam2QuizExamSumLevel
11.01.03.32.03.75.32.0
22244483
33.03.02.30.03.32.31.0
44.04.02.21.03.33.21.0
55.05.03.11.23.94.32.0
In [15]:
exam1$Level <-cut(exam1$ExamSum,c(0,2,4,6,8),labels=F)
exam1
XIDExam1Exam2QuizExamSumLevel
11.01.03.32.03.75.33.0
22244484
33.03.02.30.03.32.32.0
44.04.02.21.03.33.22.0
55.05.03.11.23.94.33.0

백화점 데이터를 통해 아래 문제를 해결

1. 50대 기혼 여성 고객리스트를 아래와 같이 출력 하시오

 custid gender age marriage    residence        job
 36  46111     여  57     기혼   Gangnam-gu   개인사업
 37  46230     여  57     기혼  Jungnang-gu   개인사업
 38  46327     여  58     기혼    Seocho-gu   금융기관
 39  46441     여  58     기혼   Gangnam-gu   교육기관
 40  46554     여  58     기혼 Seodaemun-gu 정보서비스
 41  46869     여  59     기혼    Seocho-gu     건설업
In [16]:
cs <- read.table("dataCustomers.tab", sep="\t", header = T, stringsAsFactors = F)
tr <- read.table("dataTransactions.tab", sep="\t", header = T, stringsAsFactors = F)
head(cs,3)
head(tr,3)
custidgenderagemarriageresidencejob
110070 28 미혼 Yongsan-gu제조업
210139 28 미혼 Gangdong-gu정보서비스
310208 28 미혼 Gwangjin-gu제조업
datetimecustidstoreproductbrandcornerimportamountinstallment
12000-05-01 10:4318313 신촌점 4104840008000 샤넬 화장품 1 113000 3
22000-05-01 11:0018313 신촌점 2.7e+12 식품 일반식품 0 91950 3
32000-05-01 11:3327222 신촌점 4545370944500 까사미아 가구 0 598000 3
In [17]:
filter(cs, age >= 50 & age <= 59 & gender == "여" & marriage =="기혼") %>% tail()
custidgenderagemarriageresidencejob
3646111 57 기혼 Gangnam-gu개인사업
3746230 57 기혼 Jungnang-gu개인사업
3846327 58 기혼 Seocho-gu금융기관
3946441 58 기혼 Gangnam-gu교육기관
4046554 58 기혼 Seodaemun-gu정보서비스
4146869 59 기혼 Seocho-gu건설업

2) H백화점의 남녀별 평균나이를 계산

In [18]:
cs %>% group_by(gender) %>% summarize(Cnt = n()) #남녀수
cs %>% group_by(gender) %>% summarize(age = mean(age)) #남녀 평균 나이
#group_by(cs,gender) %>% summarize(age=mean(age))
genderCnt
1154
2346
genderage
140.7012987012987
239.1994219653179

3) 거주지역 전체 출력.

In [20]:
head(distinct(cs,residence),3)
head(unique(cs$residence),3)
residence
1Yongsan-gu
2Gangdong-gu
3Gwangjin-gu
  1. "Yongsan-gu"
  2. "Gangdong-gu"
  3. "Gwangjin-gu"

4) 지점별 수입품 과 국산품

In [27]:
head(tr,3)
datetimecustidstoreproductbrandcornerimportamountinstallment
12000-05-01 10:4318313 신촌점 4104840008000 샤넬 화장품 1 113000 3
22000-05-01 11:0018313 신촌점 2.7e+12 식품 일반식품 0 91950 3
32000-05-01 11:3327222 신촌점 4545370944500 까사미아 가구 0 598000 3
In [28]:
tr %>% group_by(store,import) %>% summarise(Cnt=n())
storeimportCnt
1무역점0 4363
2무역점1 592
3본점0 3090
4본점1 632
5신촌점0 5622
6신촌점1 488
7천호점0 3988
8천호점1 350

5) 남녀별로 건당 구매액의 최소값 중앙 값 최대값을 계산하여 아래와 같이 출력하시오.

In [22]:
tmp <- merge(cs,tr)
head(tmp,3)
custidgenderagemarriageresidencejobdatetimestoreproductbrandcornerimportamountinstallment
110070 28 미혼 Yongsan-gu 제조업 2000-10-17 19:10무역점 4800429513002 밀라노스토리무역캐릭터캐주얼 0 74000 1
210070 28 미혼 Yongsan-gu 제조업 2000-11-04 17:24본점 2800429313003 밀라노 본점 캐릭터캐주얼 0 298000 3
310070 28 미혼 Yongsan-gu 제조업 2000-09-03 11:33신촌점 4106530008200 메이컵포에버 화장품 1 18000 1
In [23]:
tmp %>% group_by(gender) %>% summarise_each(funs(min,median,max), amount)
genderminmedianmax
1650 51020 8000000
2840 54354 3930000

6) 총 구매액이 가장 많은 사람부터 적은 사람순으로 정렬

In [25]:
tr %>% group_by(custid) %>% summarise(amount = sum(amount)) %>% arrange(desc(amount)) %>% head(3)
custidamount
1 4280061672778
2 1596844478591
3 1349335480804