QVC Analysis (E-Commerce 판매 데이터 분석)

발표 자료

PPT

필요한 파일.

  • Customer master2.csv
  • order_data.csv
  • product.csv
  • timezone.csv
  • zipcode.csv
  • Product airtime.csv
  • pre_order_data.csv
  • *.txt 파일
In [46]:
library(stringr)
library(arules)
library(dplyr)
library(maps)
library(ggplot2)
library(ggmap)
library(gridExtra)
library(reshape2)
library(arulesSequences)
library(lubridate)
In [2]:
theme_set(theme_gray(base_size = 13))

Data Load

In [3]:
customer <- read.csv("data/Customer master2.csv")
customer$SHOPPER_SEGMENT_CODE <- as.factor(customer$SHOPPER_SEGMENT_CODE)
In [4]:
head(customer)
STATEZIP_CODECUSTOMER_NBRSHOPPER_SEGMENT_CODEcitylatitudelongitudetimezone
1AK 99501 442649 2 Anchorage 61.216799 -149.87828US/Alaska
2AK 99501 657615 3 Anchorage 61.216799 -149.87828US/Alaska
3AK 99501 315353 1 Anchorage 61.216799 -149.87828US/Alaska
4AK 99501 235837 2 Anchorage 61.216799 -149.87828US/Alaska
5AK 99502 516194 3 Anchorage 61.153693 -149.95932US/Alaska
6AK 99502 224909 0 Anchorage 61.153693 -149.95932US/Alaska
In [83]:
order_df <- read.csv("data/order_data.csv", stringsAsFactors = F)
order_df$SHOPPER_SEGMENT_CODE[is.na(order_df$SHOPPER_SEGMENT_CODE)] <- 0
order_df$SHOPPER_SEGMENT_CODE <- as.factor(order_df$SHOPPER_SEGMENT_CODE)
In [6]:
product <- read.csv("data/Product master2.csv", stringsAsFactor = F)
In [7]:
zip <- read.csv("data/zipcode.csv", stringsAsFactors = F)

Data 확인.

1. 고객 분포도. Zip Code 별 고객 분포도.

In [8]:
v_customer <- customer %>% group_by(ZIP_CODE) %>% summarize(cnt = n()) %>% filter(cnt > 5)
v_customer <- merge(v_customer,zip[,c(1,3:5)],by.x="ZIP_CODE",by.y="zip")
Warning message:
: failed to assign NativeSymbolInfo for lhs since lhs is already defined in the 'lazyeval' namespaceWarning message:
: failed to assign NativeSymbolInfo for rhs since rhs is already defined in the 'lazyeval' namespace
In [9]:
head(v_customer)
ZIP_CODEcntstatelatitudelongitude
11001 8 MA 42.07061 -72.62029
21007 6 MA 42.278424-72.411
31013 9 MA 42.161492-72.60316
41020 9 MA 42.175992-72.57265
51027 6 MA 42.27279 -72.68677
61028 8 MA 42.066795-72.5102

지도 그리기

In [10]:
options(repr.plot.width=5, repr.plot.height=5)
In [11]:
map<-get_map(location='united states', zoom=4, maptype = "terrain",
             source='google',color='color')

ggmap(map) + geom_point(
  aes(x=longitude, y=latitude, show_guide = TRUE, colour=cnt, size=cnt),
  data=v_customer, alpha=1,na.rm = T)  +
  scale_color_gradient(low="beige", high="blue")
Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=united+states&zoom=4&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=united%20states&sensor=false

지도그리기 2

In [12]:
options(repr.plot.width=5, repr.plot.height=3)
In [13]:
us <- map_data("state")
city <- customer %>% group_by(city) %>% summarize(cnt = n())
city <- city %>% mutate(avg = cnt/mean(cnt))
city$city <- tolower(city$city)

gg <- ggplot()
gg <- gg + geom_map(data=us, map=us,
                    aes(x=long, y=lat, map_id=region),
                    fill="#ffffff", color="#ffffff", size=0.25)

gg <- gg + geom_map(data=city, map=us,
                    aes(fill=cnt, map_id=city),
                    color="#ffffff", size=0.25)
gg <- gg + scale_fill_continuous(low='thistle2', high='darkred',
                                 guide='colorbar')
gg <- gg + labs(x=NULL, y=NULL)
gg <- gg + coord_map("albers", lat0 = 39, lat1 = 45)
gg <- gg + theme(panel.border = element_blank())
gg <- gg + theme(panel.background = element_blank())
gg <- gg + theme(axis.ticks = element_blank())
gg <- gg + theme(axis.text = element_blank())
gg

timezone 별 고객 분포도

In [14]:
options(repr.plot.width=7, repr.plot.height=3)
In [15]:
timezone_customer <- customer %>% group_by(timezone) %>% summarize(cnt=n())
ggplot(timezone_customer,aes(x=timezone,y=cnt,fill=timezone)) + geom_bar(stat="identity")

2. State 별 고객 분포

In [16]:
state_number <- v_customer %>% group_by(state) %>% summarize(cnt = sum(cnt)) %>% arrange(desc(cnt))
ggplot(v_customer, aes(x=state,y=cnt)) + geom_bar(stat="identity") + theme(axis.text.x = element_text(angle = 90, hjust = 1))

3. State 별 Segment 별 고객 분포

In [17]:
customer$SHOPPER_SEGMENT_CODE[is.na(customer$SHOPPER_SEGMENT_CODE)] <- 0
customer$SHOPPER_SEGMENT_CODE <- factor(customer$SHOPPER_SEGMENT_CODE)
seg_customer <- customer
In [18]:
names(seg_customer)[4] <- "CODE"
In [19]:
ggplot(seg_customer,aes(x=STATE,fill=CODE)) + geom_bar() + theme(axis.text.x = element_text(angle = 90, hjust = 1))

4. segment 별 고객 분포

In [20]:
segment_customer <- customer %>% group_by(SHOPPER_SEGMENT_CODE) %>% summarize(cu_cnt = n()) %>% mutate(prob = round((cu_cnt/sum(cu_cnt)),2)*100)
segment_customer$SHOPPER_SEGMENT_CODE[is.na(segment_customer$SHOPPER_SEGMENT_CODE)] <- 0
In [21]:
bp<- ggplot(segment_customer, aes(x="", y=cu_cnt, fill=as.factor(SHOPPER_SEGMENT_CODE)))+
  geom_bar(width = 1, stat = "identity")
pie <- bp + coord_polar("y", start=0)
# scale_fill_brewer("Blues") + blank_theme +
pie +
  #theme(axis.text.x=element_blank())+
  geom_text(aes(y = cu_cnt/3 + c(0, cumsum(cu_cnt)[-length(cu_cnt)]),
                label = paste0(SHOPPER_SEGMENT_CODE,":",prob,"%")), size=3)

5. segment 별 구매 횟수

In [22]:
segment_order <- order_df %>% group_by(SHOPPER_SEGMENT_CODE) %>% summarize(or_cnt=n())
segment_order$percent <- round(prop.table(segment_order$or_cnt),2)

ggplot(segment_order,aes(x=SHOPPER_SEGMENT_CODE,y=or_cnt,fill=SHOPPER_SEGMENT_CODE)) + geom_bar(stat="identity",colour="black")

5.1 segment 별 회원수

In [23]:
segment_cus_order <- order_df %>% distinct(CUSTOMER_NBR,SHOPPER_SEGMENT_CODE) %>% group_by(SHOPPER_SEGMENT_CODE) %>% summarize(or_cnt=n())
 # 합쳐서
seg <- merge(segment_customer,segment_order,by="SHOPPER_SEGMENT_CODE")
seg <- seg %>% mutate(avg = round(or_cnt/cu_cnt,2))
ggplot(seg,aes(x=SHOPPER_SEGMENT_CODE,y=avg,fill=SHOPPER_SEGMENT_CODE)) + geom_bar(stat="identity",colour="black")

5.2 월별 판매 수

In [24]:
#  월별 판매 수
mon_order <- order_df %>% group_by(ORDER_MON) %>% summarize(cnt =n())
mon_order$ORDER_MON <- as.factor(mon_order$ORDER_MON)
mon_order$per <- round(prop.table(mon_order$cnt),2) * 100
ggplot(data=mon_order,aes(x=ORDER_MON,y=cnt,fill=ORDER_MON)) +geom_bar(stat="identity")

6. 요일별 주문 Count

In [25]:
v_day <- order_df %>% group_by(ORDER_DAY) %>% summarize(cnt=n())
v_day$ORDER_DAY = factor(v_day$ORDER_DAY,labels=c("SUN","MON","TUE","WED","THU","FRI","SAT"),ordered=TRUE)
ggplot(v_day, aes(x=ORDER_DAY,y=cnt,fill=ORDER_DAY)) + geom_bar(stat="identity",colour="black")

7. 카테고리별 구매 횟수 5개

In [26]:
order_per_cate <- order_df %>% group_by(MERCH_DIV_DESC) %>% summarize(cnt = n()) %>% arrange(desc(cnt))
ggplot(head(order_per_cate), aes(x=MERCH_DIV_DESC,y=cnt,fill=MERCH_DIV_DESC)) + geom_bar(stat="identity",colour="black")

8. 가장 많이 팔린 제품 상위 5개

In [27]:
order_per_product <- order_df %>% group_by(PACKING_SLIP_DESC) %>% summarize(cnt = n()) %>% arrange(desc(cnt))
ggplot(head(order_per_product), aes(x=PACKING_SLIP_DESC,y=cnt,fill=PACKING_SLIP_DESC)) + geom_bar(stat="identity")

9. 시간대별 판매 수

In [28]:
v_time_cate <- order_df %>% group_by(ORDER_HOUR,MERCH_DIV_DESC) %>% summarize(cnt = n()) # 다른 데서 line plot으로 시간대별로 다르게 표현.
time_order <- v_time_cate %>% group_by(ORDER_HOUR) %>% summarize(cnt = sum(cnt))
ggplot(data=time_order, aes(x=factor(ORDER_HOUR), y=cnt,group=1)) + geom_line(color="red") + geom_point(color="blue")

10. Timezone 별 주문 횟수

In [29]:
timezone_order <- order_df %>% group_by(timezone) %>% summarize(cnt = n()) %>% mutate(prob = round((cnt/sum(cnt)),3)*100)
timezone_order
timezonecntprob
1CST 879123.4
2EST 2150457.2
3MST 18895
4PST 520113.8
5US/Alaska151 0.4
6US/Hawaii82 0.2
In [48]:
timezone_customer <- customer %>% group_by(timezone) %>% summarize(cnt=n())
timezone_cluster_customer <- customer %>% group_by(timezone,cluster) %>% summarize(cnt = n())
timezone_segment_customer <- customer %>% group_by(timezone,SHOPPER_SEGMENT_CODE) %>% summarize(cnt=n())

df_order_timezone <- data.frame(timezone = timezone_order$timezone, timezone_cnt = timezone_order$cnt)
df_customer_timezone <- data.frame(timezone = timezone_customer$timezone, customer_cnt = timezone_customer$cnt)
df_timezone <- merge(df_customer_timezone,df_order_timezone,by="timezone")
df_timezone <- df_timezone %>% mutate(per_order = round(timezone_cnt / customer_cnt,2))

timezone_order <- left_join(timezone_order,df_timezone[,c(1,4)])
Joining, by = "timezone"
Warning message:
In left_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining factor and character vector, coercing into character vector
In [50]:
options(repr.plot.height=5)
p1 <- ggplot(timezone_order,aes(x=timezone,y=prob,fill=timezone)) + geom_bar(stat="identity")
p2 <- ggplot(timezone_order,aes(x=timezone,y=per_order,fill=timezone)) + geom_bar(stat="identity")
gridExtra::grid.arrange(p1, p2)

11. Segmnet 요일 구매 패턴

In [52]:
options(repr.plot.height=3)
segment_mon_order <- order_df %>% group_by(SHOPPER_SEGMENT_CODE,ORDER_DAY) %>% summarize(or_cnt=n())
segment_mon_order$ORDER_DAY = factor(segment_mon_order$ORDER_DAY,labels=c("SUN","MON","TUE","WED","THU","FRI","SAT"),ordered=TRUE)
ggplot(segment_mon_order,aes(x=SHOPPER_SEGMENT_CODE,y=or_cnt,fill=ORDER_DAY)) + geom_bar(stat="identity",position="dodge")
Warning message:
: Removed 7 rows containing missing values (geom_bar).

12 . 광고 시청 유무에 따른 구매 및 전체 광고 흐름

In [62]:
air_product <- read.csv("data/Product airtime.csv")
head(air_product)
PRODUCT_IDAIR_DATEPRODUCT_AIRTIME_MINSPRODUCT_START_TMSPRODUCT_STOP_TMS
12186 2012-10-16 0:000.38 2012-10-16 0:432012-10-16 0:44
22186 2012-10-16 0:005.08 2012-10-16 0:482012-10-16 0:53
32478 2012-10-16 0:000.5 2012-10-16 0:252012-10-16 0:26
42478 2012-10-16 0:0012.53 2012-10-16 0:312012-10-16 0:43
56283 2012-10-16 0:000.43 2012-10-16 0:472012-10-16 0:48
66283 2012-10-16 0:004.73 2012-10-16 0:532012-10-16 0:58

12-1 시간대별 광고 전체

In [63]:
air_product$PRODUCT_START_TMS  <- ymd_hm(air_product$PRODUCT_START_TMS)
air_product$PRODUCT_STOP_TMS  <- ymd_hm(air_product$PRODUCT_STOP_TMS)
air_product$AIR_DATE <- ymd_hm(air_product$AIR_DATE)
air_product$START_HOUR <- hour(air_product$PRODUCT_START_TMS)
air_product$STOP_HOUR <- hour(air_product$PRODUCT_STOP_TMS)
air_product$AIR_MON <- month(air_product$AIR_DATE)
head(air_product)
  PRODUCT_ID   AIR_DATE PRODUCT_AIRTIME_MINS   PRODUCT_START_TMS
1       2186 2012-10-16                 0.38 2012-10-16 00:43:00
2       2186 2012-10-16                 5.08 2012-10-16 00:48:00
3       2478 2012-10-16                 0.50 2012-10-16 00:25:00
4       2478 2012-10-16                12.53 2012-10-16 00:31:00
5       6283 2012-10-16                 0.43 2012-10-16 00:47:00
6       6283 2012-10-16                 4.73 2012-10-16 00:53:00
     PRODUCT_STOP_TMS START_HOUR STOP_HOUR AIR_MON
1 2012-10-16 00:44:00          0         0      10
2 2012-10-16 00:53:00          0         0      10
3 2012-10-16 00:26:00          0         0      10
4 2012-10-16 00:43:00          0         0      10
5 2012-10-16 00:48:00          0         0      10
6 2012-10-16 00:58:00          0         0      10
In [66]:
total_air_hour <- air_product %>% group_by(START_HOUR) %>% summarize(cnt = n())
total_air_hour$type = "Total_Ad"
ggplot(total_air_hour,aes(x=factor(START_HOUR),y=cnt,group=2))  + geom_line(size = 2,color="red") + geom_point(color="blue",size=2)
In [68]:
# 구매 제품의 방송 시간에 따른 구매 ####
tmp_df <- order_df[!is.na(order_df$AIR_DATE),]
air_time_order <- tmp_df %>% group_by(START_HOUR) %>% summarize(cnt = n())
In [75]:
air_time_order$type="ConnectToSell"
In [76]:
air_df <- rbind(total_air_hour,air_time_order)
air_df$START_HOUR <- factor(air_df$START_HOUR)
In [77]:
ggplot(air_df,aes(x=START_HOUR,y=cnt)) + geom_line(aes(group=type,color=type),size = 2) + geom_point(aes(group=type,color=type),size = 2)
In [53]:
# 광고를 본사람 vs 안본사람 시간대 비교 ####
air_order_df <- order_df[!is.na(order_df$AIR_DATE),]
no_air_order_df <- order_df[is.na(order_df$AIR_DATE),]

air_time_order <- air_order_df %>% group_by(ORDER_HOUR) %>% summarize(cnt=n())
no_air_time_order <- no_air_order_df %>% group_by(ORDER_HOUR) %>% summarize(cnt=n())

air_time_order$type="air"
no_air_time_order$type="noair"

compare_air <- rbind(air_time_order,no_air_time_order)
ggplot(compare_air,aes(x=ORDER_HOUR,y=cnt)) + geom_line(aes(group=type,color=type),size = 2) + geom_point(aes(group=type,color=type),size = 2)

전체 주문

  • 0 ~ 6시 : 타 시간대 대비 광고량 대비 구매량이 현저히 적음
  • 10 ~ 12시 : 주문 횟수 > 광고 횟수, 광고 ↑ : 구매 ↑
  • 20 ~ 23시 : 광고 유무 -> 구매 영향도 높음, 매출 노ㅠ은 짧은 광고 시간의 상품 노출 필요
In [80]:
names(air_df)[1] <- "HOUR"
names(compare_air)[1] <- "HOUR"

# 전체 주문 확보 ####
total_order <- order_df %>% group_by(ORDER_HOUR) %>% summarize(cnt=n())
names(total_order)[1] <- "HOUR"
total_order$type <- "TotalOrder"

tmp_df <- rbind(air_df,compare_air)
tmp_df <- rbind(tmp_df,total_order)
ggplot(tmp_df,aes(x=HOUR,y=cnt)) + geom_line(aes(group=type,color=type),size = 1) + geom_point(aes(group=type,color=type),size = 1)

Answer the Questions

1. What is the next product a customer will buy in the next month given their previous buying behavior and product airtime?

  • 제품의 경우 같은 Cluster 내 고객들의 구매 내역 중 내가 사지 않은 내역을 추천
In [88]:
order_df <- read.csv("data/order_data_cluster.csv", stringsAsFactors = F)
customer <- read.csv("data/customer_with_cluster.csv",stringsAsFactors = F)
In [85]:
order_df$SHOPPER_SEGMENT_CODE[is.na(order_df$SHOPPER_SEGMENT_CODE)] <- 0
order_df$SHOPPER_SEGMENT_CODE <- as.factor(order_df$SHOPPER_SEGMENT_CODE)
In [31]:
for(i in 1:9){
  tmp_df <- order_df %>% filter(cluster==i)
  assign(paste0("cluster_",i),tmp_df)
}
In [32]:
for(i in 1:9){
  tmp_df <- get(paste0("cluster_",i))
  tmp_df <- tmp_df %>% group_by(CUSTOMER_NBR,MERCH_DIV_DESC) %>% summarize(cnt = n()) %>% arrange(CUSTOMER_NBR)
  assign(paste0("cluster_cate_",i),as.data.frame(tmp_df))
}
In [33]:
for(i in 1:9){
  tmp_df <- get(paste0("cluster_",i))
  tmp_df <- tmp_df %>% group_by(CUSTOMER_NBR,PRODUCT_ID) %>% summarize(cnt = n()) %>% arrange(CUSTOMER_NBR)
  assign(paste0("cluster_product_",i),as.data.frame(tmp_df))
}

id별로 추천 가져오는 Function

1. Product 별

  • Product 의 경우 한번 구매한 제품에 대해서 재 구매 확률이 낮으므로 Cluster별로 구매 했던 내역에서 고객이 주문한 내역을 제외한 빈도가 가장 높은 순으로 추천 하여 구매를 예측
In [34]:
getProductRecommand <- function(x){
  custId <- x
  custCluster <- customer[customer$CUSTOMER_NBR==custId,]$cluster
  in_df <- get(paste0("cluster_product_",custCluster))
  in_product_list <- in_df[in_df$CUSTOMER_NBR==custId,]$PRODUCT_ID
  in_product_list <- as.data.frame(in_product_list)
  other_df <- in_df %>% filter(CUSTOMER_NBR!=custId) %>% group_by(PRODUCT_ID) %>% summarize(cnt = sum(cnt)) %>% arrange(desc(cnt))
  other_df <- as.data.frame(other_df)
  other_df$percent <- round(prop.table(other_df$cnt),2)
  result_df <- other_df %>% filter(!(PRODUCT_ID %in% in_product_list$in_producut_list))
  result_df <- left_join(result_df,product)
  names(in_product_list)[1] <- "PRODUCT_ID"
  in_product_list <- left_join(in_product_list,product)
  print(in_product_list[,c(1,2)])
  print(head(result_df[,c(1,4,2,3)]))
}

2. In what product category is a customer likely to buy their next product, given their previous buying behavior and product airtime?

  • 카테고리의 경우 같은 카테고리를 구매할 가능성이 높으므로 연관분석을 통해 추천

2. Category

In [35]:
getCateRecommand <- function(x){
  custId <- x
  custCluster <- customer[customer$CUSTOMER_NBR==custId,]$cluster
  if(length(custCluster)==0){
    return(0)
  }
  baskets <-  read_baskets(con  = paste0("data/item_cate_",custCluster,".txt"),sep = "#", info = c("sequenceID","eventID","SIZE"))
  baskets <- cspade(baskets, parameter = list(support = 0.0001), control = list(verbose = TRUE))
  baskets_df <- as(baskets, "data.frame")
  baskets_df <- baskets_df[order(baskets_df$support,decreasing = T),]
  baskets_df$sequence <- gsub(pattern = "<\\{\"","",baskets_df$sequence)
  baskets_df$sequence <- gsub(pattern = "\"\\}>","",baskets_df$sequence)
  baskets_df$sequence <- gsub(pattern = "\"","",baskets_df$sequence)
  last_df <- order_df[order_df$CUSTOMER_NBR==custId,] %>%
    arrange(desc(ORDER_DATES)) %>%
    select(MERCH_DIV_DESC)
  last_cate <- last_df$MERCH_DIV_DESC
  support_df <- baskets_df %>% filter(str_detect(sequence,last_cate))
  print(paste0(last_cate))
  print(head(support_df[-1,]))
}
In [36]:
getRecommand <- function(cate,y){
  if(cate=="cate"){
    getCateRecommand(y)
  }else{
    getProductRecommand(y)
  }
}
In [37]:
getRecommand("cate",1275)
parameter specification:
support : 1e-04
maxsize :    10
maxlen  :    10

algorithmic control:
bfstype  : FALSE
verbose  :  TRUE
summary  : FALSE
tidLists : FALSE

preprocessing ... 1 partition(s), 0.09 MB [0.66s]
mining transactions ... 0 MB [0.4s]
reading sequences ... [0.11s]

total elapsed time: 1.17s
[1] "Entertainment" "Health/Beauty" "Health/Beauty" "Health/Beauty"
[5] "Health/Beauty" "Health/Beauty"
                       sequence      support
2      Health/Beauty,Home Decor 0.0032706460
3 Health/Beauty,IQVC Divisional 0.0029980921
4         Health/Beauty,Apparel 0.0027255383
5     Accessories,Health/Beauty 0.0010902153
6         Health/Beauty,Jewelry 0.0008176615
7      Health/Beauty,Housewares 0.0008176615
In [38]:
getRecommand("product",749351)
Joining, by = "PRODUCT_ID"
Joining, by = "PRODUCT_ID"
  PRODUCT_ID                               PACKING_SLIP_DESC
1       9887      Layers by Lizden Marvelush Smooth & Ribbed
2      11916  Milana Bra by Genie Set of 2 Lace Overlay Bras
3      32338          Set of 20 Faceted Droplet Ornaments by
4      44978     UltraFine Silver 8mm Reversible Diamond Cut
5      49825 Simply Stacks Sterling 4mm Round Peridot Solita
6      49826 Simply Stacks Sterling 4mm Round Pink Tourmalin
  PRODUCT_ID                              PACKING_SLIP_DESC cnt percent
1      61109    FlashPad 2.0 Touchscreen Handheld Game with  51    0.02
2      10811 BareTraps Tall Shaft Boots w/ Ruching & Gore -  42    0.01
3      11916 Milana Bra by Genie Set of 2 Lace Overlay Bras  40    0.01
4      32565   Berkshire Blanket Super Soft Oversized Plush  31    0.01
5      32612   Bethlehem Lights Indoor Outdoor 20 ft 50 LED  29    0.01
6       9887     Layers by Lizden Marvelush Smooth & Ribbed  26    0.01

3. What are the products and categories that sell better in a particular geographic, time zone, and customer segment?

3-1 particular geographic

In [39]:
v3_1 <- order_df %>%
  group_by(STATE,PACKING_SLIP_DESC) %>%
  summarize(Pro_cnt=n()) %>%
  slice(which.max(Pro_cnt))
In [40]:
v3_1_2 <- order_df %>% group_by(STATE,MERCH_DIV_DESC) %>% summarize(Category_cnt=n()) %>% slice(which.max(Category_cnt))
In [41]:
head(v3_1)
STATEPACKING_SLIP_DESCPro_cnt
1AK Berkshire Blanket Super Soft Oversized Plush11
2AL BareTraps Tall Shaft Boots w/ Ruching & Gore -8
3AR Berkshire Blanket Super Soft Oversized Plush9
4AZ WEN by Chaz Dean Fall Ginger Pumpkin7
5CA Berkshire Blanket Super Soft Oversized Plush41
6CO Berkshire Blanket Super Soft Oversized Plush12
In [42]:
head(v3_1_2)
STATEMERCH_DIV_DESCCategory_cnt
1AK Home Decor34
2AL Home Decor85
3AR Apparel46
4AZ Home Decor91
5CA Apparel856
6CO Home Decor132

3-2 Time Zone

In [43]:
v3_3 <- order_df %>% group_by(timezone,PACKING_SLIP_DESC) %>% summarize(Hour_P_cnt=n()) %>% slice(which.max(Hour_P_cnt))
v3_4 <- order_df %>% group_by(timezone,MERCH_DIV_DESC) %>% summarize(Hour_C_cnt=n()) %>% slice(which.max(Hour_C_cnt))
In [44]:
head(v3_3)
timezonePACKING_SLIP_DESCHour_P_cnt
1CST6CDT Berkshire Blanket Super Soft Oversized Plush155
2EST5EDT FlashPad 2.0 Touchscreen Handheld Game with292
3MST WEN by Chaz Dean Fall Ginger Pumpkin7
4MST7MDT FlashPad 2.0 Touchscreen Handheld Game with49
5PST8PDT Milana Bra by Genie Set of 2 Lace Overlay Bras66
6US/Alaska Berkshire Blanket Super Soft Oversized Plush11
In [45]:
head(v3_4)
timezoneMERCH_DIV_DESCHour_C_cnt
1CST6CDT Home Decor1855
2EST5EDT Home Decor4656
3MST Home Decor91
4MST7MDT Home Decor334
5PST8PDTApparel1133
6US/Alaska Home Decor34

3-3 Segment

In [81]:
cs_prodcut <- order_df %>%
  group_by(SHOPPER_SEGMENT_CODE,PACKING_SLIP_DESC) %>%
  summarize(pro_cnt=n()) %>%
  slice(which.max(pro_cnt))
head(cs_prodcut)
SHOPPER_SEGMENT_CODEPACKING_SLIP_DESCpro_cnt
11 FlashPad 2.0 Touchscreen Handheld Game with86
22 Bethlehem Lights Product Recall Packet26
33 Bethlehem Lights Product Recall Packet77
44 FlashPad 2.0 Touchscreen Handheld Game with335
55 Berkshire Blanket Super Soft Oversized Plush34
6NA FlashPad 2.0 Touchscreen Handheld Game with82
In [82]:
cs_category <- order_df %>%
  group_by(SHOPPER_SEGMENT_CODE,MERCH_DIV_DESC) %>%
  summarize(cate_cnt=n()) %>%
  slice(which.max(cate_cnt))
head(cs_category)
SHOPPER_SEGMENT_CODEMERCH_DIV_DESCcate_cnt
11 Home Decor996
22 Home Decor359
33 Home Decor562
44 Apparel6016
55 Health/Beauty473
6NA Home Decor1040

3-4 TimeZone & Segment Cross

In [96]:
# 3. 타임존, 세그먼트별 잘팔리는 제품. 
tzsg_product <- order_df %>% group_by(timezone,SHOPPER_SEGMENT_CODE,PACKING_SLIP_DESC) %>% summarize(tzsg_pro_cnt=n()) %>% slice(which.max(tzsg_pro_cnt))
tzsg_category <- order_df %>% group_by(timezone,SHOPPER_SEGMENT_CODE,MERCH_DIV_DESC) %>% summarize(tzsg_cate_cnt=n()) %>% slice(which.max(tzsg_cate_cnt))
tzsg_product$SHOPPER_SEGMENT_CODE[is.na(tzsg_product$SHOPPER_SEGMENT_CODE)] <- 0
tzsg_category$SHOPPER_SEGMENT_CODE[is.na(tzsg_category$SHOPPER_SEGMENT_CODE)] <- 0
In [97]:
head(tzsg_category)
timezoneSHOPPER_SEGMENT_CODEMERCH_DIV_DESCtzsg_cate_cnt
1CST 1 Home Decor257
2CST 2 Home Decor79
3CST 3 Home Decor123
4CST 4 Apparel1398
5CST 5 Health/Beauty108
6CST 0 Home Decor273
In [98]:
tail(tzsg_category)
timezoneSHOPPER_SEGMENT_CODEMERCH_DIV_DESCtzsg_cate_cnt
1US/Hawaii 1 Home Decor7
2US/Hawaii 2 Accessories1
3US/Hawaii 3 Entertainment3
4US/Hawaii4 Apparel 10
5US/Hawaii 5 Health/Beauty2
6US/Hawaii 0 Health/Beauty2

4. Is there a best time of day to sell a particular product or product category?

In [99]:
product_per_hour <- order_df %>%
  group_by(PACKING_SLIP_DESC,ORDER_HOUR) %>%
  summarize(pro_cnt=n()) %>%
  slice(which.max(pro_cnt)) %>%
  filter(pro_cnt>10) %>% arrange(ORDER_HOUR)
In [100]:
head(product_per_hour)
PACKING_SLIP_DESCORDER_HOURpro_cnt
1Berkshire Blanket Super Soft Oversized Plush0 132
2Duraflame 750W/1500W Electric Stove Heater with0 17
3FlashPad 2.0 Touchscreen Handheld Game with0 66
4INFOtainment Apple iPad Charging and Docking0 33
5Josie Maran Art of Giving 8-pc Argan Bath &0 20
6Judith Ripka Stainless Steel Chronograph Watch0 16

5.What is the brand affinity (personal connection with the brands QVC sells) for QVC’s different customer segments?

  • 간단하게 답변한다면 Segment별로 좋아하는 Brand를 가지고 오면 된다.
  • 하지만 각 브랜드별로 어떤 상관관계가 있고 구매를 촉진 시키는지 Cross Selling이 가능한지 여부를 확인해야한다.
In [101]:
segment_brand <- order_df %>% group_by(SHOPPER_SEGMENT_CODE,BRAND_NAME) %>% summarize(cnt = n()) %>% arrange(SHOPPER_SEGMENT_CODE,desc(cnt))
segment_brand_max <- order_df %>% filter(BRAND_NAME!="Not Known" & BRAND_NAME!="N/A") %>% group_by(SHOPPER_SEGMENT_CODE,BRAND_NAME) %>% summarize(cnt = n()) %>% slice(which.max(cnt))
segment_brand_max$SHOPPER_SEGMENT_CODE[is.na(segment_brand_max$SHOPPER_SEGMENT_CODE)] <- 0
In [102]:
segment_brand_max
SHOPPER_SEGMENT_CODEBRAND_NAMEcnt
11 WEN218
22 Temp-tations46
33 Denim & Co.107
44 Denim & Co.1857
55 WEN95
60 Susan Graver246

브랜드별 상관관계

상위 20개 브랜드

In [107]:
count_brand <- order_df %>% group_by(BRAND_NAME) %>% summarize(cnt = n()) %>% arrange(desc(cnt)) %>% head(20)

brand_list <- count_brand$BRAND_NAME
brand_list <- as.data.frame(brand_list)
brand_rel <- order_df %>% filter(BRAND_NAME %in% brand_list$brand_list & BRAND_NAME != "Not Known" & BRAND_NAME != "N/A") %>% group_by(CUSTOMER_NBR,BRAND_NAME) %>% summarize(cnt = n())
brand_rel <- dcast(CUSTOMER_NBR ~ BRAND_NAME, data = brand_rel, value.var = "cnt")
brand_rel[is.na(brand_rel)] <- 0
brand_cor <- round(cor(brand_rel),2)
brand_cor <- as.data.frame(brand_cor)
In [110]:
library(psych)
In [111]:
options(repr.plot.width=8, repr.plot.height=8)
pairs.panels(brand_rel)
In [106]:
brand_cor
CUSTOMER_NBRBareTrapsBethlehem LightsClarksDenim & Co.EmjoiFlash PadGenie BraIsaac Mizrahi Live!Josie MaranLiz Claiborne New YorkLizdenMally BeautyphilosophyQuacker FactorySusan GraverTemp-tationsValerie Parr HillWEN
CUSTOMER_NBR 1.00 0.01 0.00-0.02 0.02 0.00-0.01 0.02 0.02 0.00 0.00 0.01-0.01 0.01 0.00 0.03 0.02-0.02 0.01
BareTraps 0.01 1.00 0.02-0.01 0.04 0.01 0.02 0.00 0.04 0.00 0.01 0.03-0.01 0.00-0.01 0.00 0.01 0.04-0.04
Bethlehem Lights 0.00 0.02 1.00 0.01 0.07-0.02 0.06 0.04 0.05 0.02 0.02 0.03 0.02 0.01 0.03 0.02 0.05 0.13-0.02
Clarks-0.02-0.01 0.01 1.00 0.05 0.01-0.01 0.02 0.02 0.00 0.04 0.04-0.02 0.01 0.00 0.04-0.01 0.01-0.03
Denim & Co. 0.02 0.04 0.07 0.05 1.00 0.01-0.01 0.06 0.11 0.02 0.22 0.14 0.05 0.10 0.24 0.36 0.04 0.11-0.01
Emjoi 0.00 0.01-0.02 0.01 0.01 1.00 0.03 0.01 0.01 0.00 0.02 0.02 0.00-0.01-0.01 0.03-0.01 0.02-0.03
Flash Pad-0.01 0.02 0.06-0.01-0.01 0.03 1.00 0.06 0.01 0.00-0.02 0.03-0.02-0.03 0.01-0.01-0.03 0.01-0.03
Genie Bra 0.02 0.00 0.04 0.02 0.06 0.01 0.06 1.00 0.09 0.03 0.11 0.10 0.02 0.02 0.05 0.13 0.01 0.01-0.03
Isaac Mizrahi Live! 0.02 0.04 0.05 0.02 0.11 0.01 0.01 0.09 1.00 0.03 0.28 0.11 0.02 0.06 0.09 0.17 0.01 0.01-0.01
Josie Maran 0.00 0.00 0.02 0.00 0.02 0.00 0.00 0.03 0.03 1.00 0.03 0.02 0.08 0.06-0.01 0.01-0.01 0.05 0.04
Liz Claiborne New York 0.00 0.01 0.02 0.04 0.22 0.02-0.02 0.11 0.28 0.03 1.00 0.16 0.05 0.02 0.23 0.44 0.01 0.02-0.01
Lizden 0.01 0.03 0.03 0.04 0.14 0.02 0.03 0.10 0.11 0.02 0.16 1.00 0.02 0.04 0.12 0.21 0.02 0.07-0.01
Mally Beauty-0.01-0.01 0.02-0.02 0.05 0.00-0.02 0.02 0.02 0.08 0.05 0.02 1.00 0.04 0.06 0.08-0.01 0.04 0.01
philosophy 0.01 0.00 0.01 0.01 0.10-0.01-0.03 0.02 0.06 0.06 0.02 0.04 0.04 1.00 0.00 0.03 0.00 0.06 0.00
Quacker Factory 0.00-0.01 0.03 0.00 0.24-0.01 0.01 0.05 0.09-0.01 0.23 0.12 0.06 0.00 1.00 0.37 0.01 0.04-0.03
Susan Graver 0.03 0.00 0.02 0.04 0.36 0.03-0.01 0.13 0.17 0.01 0.44 0.21 0.08 0.03 0.37 1.00 0.01 0.03-0.02
Temp-tations 0.02 0.01 0.05-0.01 0.04-0.01-0.03 0.01 0.01-0.01 0.01 0.02-0.01 0.00 0.01 0.01 1.00 0.11-0.03
Valerie Parr Hill-0.02 0.04 0.13 0.01 0.11 0.02 0.01 0.01 0.01 0.05 0.02 0.07 0.04 0.06 0.04 0.03 0.11 1.00-0.02
WEN 0.01-0.04-0.02-0.03-0.01-0.03-0.03-0.03-0.01 0.04-0.01-0.01 0.01 0.00-0.03-0.02-0.03-0.02 1.00

결론 : 최적의 방송시간 및 마케팅전략