QVC Analysis (E-Commerce 판매 데이터 분석)¶

전처리 과정은 Jupyter 메모리상 중간에 되지를 않게 되어 .R 내용을 확인
Data_Handling
Making OrderData
Clustering
Recommand System

발표 자료¶

필요한 파일.¶

Customer master2.csv
order_data.csv
product.csv
timezone.csv
zipcode.csv
Product airtime.csv
pre_order_data.csv
*.txt 파일

library(stringr)
library(arules)
library(dplyr)
library(maps)
library(ggplot2)
library(ggmap)
library(gridExtra)
library(reshape2)
library(arulesSequences)
library(lubridate)

theme_set(theme_gray(base_size = 13))

Data Load¶

customer <- read.csv("data/Customer master2.csv")
customer$SHOPPER_SEGMENT_CODE <- as.factor(customer$SHOPPER_SEGMENT_CODE)

head(customer)

order_df <- read.csv("data/order_data.csv", stringsAsFactors = F)
order_df$SHOPPER_SEGMENT_CODE[is.na(order_df$SHOPPER_SEGMENT_CODE)] <- 0
order_df$SHOPPER_SEGMENT_CODE <- as.factor(order_df$SHOPPER_SEGMENT_CODE)

product <- read.csv("data/Product master2.csv", stringsAsFactor = F)

zip <- read.csv("data/zipcode.csv", stringsAsFactors = F)

Data 확인.¶

1. 고객 분포도. Zip Code 별 고객 분포도.¶

v_customer <- customer %>% group_by(ZIP_CODE) %>% summarize(cnt = n()) %>% filter(cnt > 5)
v_customer <- merge(v_customer,zip[,c(1,3:5)],by.x="ZIP_CODE",by.y="zip")

Warning message:
: failed to assign NativeSymbolInfo for lhs since lhs is already defined in the 'lazyeval' namespaceWarning message:
: failed to assign NativeSymbolInfo for rhs since rhs is already defined in the 'lazyeval' namespace

head(v_customer)

지도 그리기¶

options(repr.plot.width=5, repr.plot.height=5)

map<-get_map(location='united states', zoom=4, maptype = "terrain",
             source='google',color='color')

ggmap(map) + geom_point(
  aes(x=longitude, y=latitude, show_guide = TRUE, colour=cnt, size=cnt),
  data=v_customer, alpha=1,na.rm = T)  +
  scale_color_gradient(low="beige", high="blue")

Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=united+states&zoom=4&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=united%20states&sensor=false

지도그리기 2¶

options(repr.plot.width=5, repr.plot.height=3)

us <- map_data("state")
city <- customer %>% group_by(city) %>% summarize(cnt = n())
city <- city %>% mutate(avg = cnt/mean(cnt))
city$city <- tolower(city$city)

gg <- ggplot()
gg <- gg + geom_map(data=us, map=us,
                    aes(x=long, y=lat, map_id=region),
                    fill="#ffffff", color="#ffffff", size=0.25)

gg <- gg + geom_map(data=city, map=us,
                    aes(fill=cnt, map_id=city),
                    color="#ffffff", size=0.25)
gg <- gg + scale_fill_continuous(low='thistle2', high='darkred',
                                 guide='colorbar')
gg <- gg + labs(x=NULL, y=NULL)
gg <- gg + coord_map("albers", lat0 = 39, lat1 = 45)
gg <- gg + theme(panel.border = element_blank())
gg <- gg + theme(panel.background = element_blank())
gg <- gg + theme(axis.ticks = element_blank())
gg <- gg + theme(axis.text = element_blank())
gg

timezone 별 고객 분포도¶

options(repr.plot.width=7, repr.plot.height=3)

timezone_customer <- customer %>% group_by(timezone) %>% summarize(cnt=n())
ggplot(timezone_customer,aes(x=timezone,y=cnt,fill=timezone)) + geom_bar(stat="identity")

2. State 별 고객 분포¶

state_number <- v_customer %>% group_by(state) %>% summarize(cnt = sum(cnt)) %>% arrange(desc(cnt))
ggplot(v_customer, aes(x=state,y=cnt)) + geom_bar(stat="identity") + theme(axis.text.x = element_text(angle = 90, hjust = 1))

3. State 별 Segment 별 고객 분포¶

customer$SHOPPER_SEGMENT_CODE[is.na(customer$SHOPPER_SEGMENT_CODE)] <- 0
customer$SHOPPER_SEGMENT_CODE <- factor(customer$SHOPPER_SEGMENT_CODE)
seg_customer <- customer

names(seg_customer)[4] <- "CODE"

ggplot(seg_customer,aes(x=STATE,fill=CODE)) + geom_bar() + theme(axis.text.x = element_text(angle = 90, hjust = 1))

4. segment 별 고객 분포¶

segment_customer <- customer %>% group_by(SHOPPER_SEGMENT_CODE) %>% summarize(cu_cnt = n()) %>% mutate(prob = round((cu_cnt/sum(cu_cnt)),2)*100)
segment_customer$SHOPPER_SEGMENT_CODE[is.na(segment_customer$SHOPPER_SEGMENT_CODE)] <- 0

bp<- ggplot(segment_customer, aes(x="", y=cu_cnt, fill=as.factor(SHOPPER_SEGMENT_CODE)))+
  geom_bar(width = 1, stat = "identity")
pie <- bp + coord_polar("y", start=0)
# scale_fill_brewer("Blues") + blank_theme +
pie +
  #theme(axis.text.x=element_blank())+
  geom_text(aes(y = cu_cnt/3 + c(0, cumsum(cu_cnt)[-length(cu_cnt)]),
                label = paste0(SHOPPER_SEGMENT_CODE,":",prob,"%")), size=3)

5. segment 별 구매 횟수¶

segment_order <- order_df %>% group_by(SHOPPER_SEGMENT_CODE) %>% summarize(or_cnt=n())
segment_order$percent <- round(prop.table(segment_order$or_cnt),2)

ggplot(segment_order,aes(x=SHOPPER_SEGMENT_CODE,y=or_cnt,fill=SHOPPER_SEGMENT_CODE)) + geom_bar(stat="identity",colour="black")

5.1 segment 별 회원수¶

segment_cus_order <- order_df %>% distinct(CUSTOMER_NBR,SHOPPER_SEGMENT_CODE) %>% group_by(SHOPPER_SEGMENT_CODE) %>% summarize(or_cnt=n())
 # 합쳐서
seg <- merge(segment_customer,segment_order,by="SHOPPER_SEGMENT_CODE")
seg <- seg %>% mutate(avg = round(or_cnt/cu_cnt,2))
ggplot(seg,aes(x=SHOPPER_SEGMENT_CODE,y=avg,fill=SHOPPER_SEGMENT_CODE)) + geom_bar(stat="identity",colour="black")

5.2 월별 판매 수¶

#  월별 판매 수
mon_order <- order_df %>% group_by(ORDER_MON) %>% summarize(cnt =n())
mon_order$ORDER_MON <- as.factor(mon_order$ORDER_MON)
mon_order$per <- round(prop.table(mon_order$cnt),2) * 100
ggplot(data=mon_order,aes(x=ORDER_MON,y=cnt,fill=ORDER_MON)) +geom_bar(stat="identity")

6. 요일별 주문 Count¶

v_day <- order_df %>% group_by(ORDER_DAY) %>% summarize(cnt=n())
v_day$ORDER_DAY = factor(v_day$ORDER_DAY,labels=c("SUN","MON","TUE","WED","THU","FRI","SAT"),ordered=TRUE)
ggplot(v_day, aes(x=ORDER_DAY,y=cnt,fill=ORDER_DAY)) + geom_bar(stat="identity",colour="black")

7. 카테고리별 구매 횟수 5개¶

order_per_cate <- order_df %>% group_by(MERCH_DIV_DESC) %>% summarize(cnt = n()) %>% arrange(desc(cnt))
ggplot(head(order_per_cate), aes(x=MERCH_DIV_DESC,y=cnt,fill=MERCH_DIV_DESC)) + geom_bar(stat="identity",colour="black")

8. 가장 많이 팔린 제품 상위 5개¶

order_per_product <- order_df %>% group_by(PACKING_SLIP_DESC) %>% summarize(cnt = n()) %>% arrange(desc(cnt))
ggplot(head(order_per_product), aes(x=PACKING_SLIP_DESC,y=cnt,fill=PACKING_SLIP_DESC)) + geom_bar(stat="identity")

9. 시간대별 판매 수¶

v_time_cate <- order_df %>% group_by(ORDER_HOUR,MERCH_DIV_DESC) %>% summarize(cnt = n()) # 다른 데서 line plot으로 시간대별로 다르게 표현.
time_order <- v_time_cate %>% group_by(ORDER_HOUR) %>% summarize(cnt = sum(cnt))
ggplot(data=time_order, aes(x=factor(ORDER_HOUR), y=cnt,group=1)) + geom_line(color="red") + geom_point(color="blue")

10. Timezone 별 주문 횟수¶

timezone_order <- order_df %>% group_by(timezone) %>% summarize(cnt = n()) %>% mutate(prob = round((cnt/sum(cnt)),3)*100)
timezone_order

timezone_customer <- customer %>% group_by(timezone) %>% summarize(cnt=n())
timezone_cluster_customer <- customer %>% group_by(timezone,cluster) %>% summarize(cnt = n())
timezone_segment_customer <- customer %>% group_by(timezone,SHOPPER_SEGMENT_CODE) %>% summarize(cnt=n())

df_order_timezone <- data.frame(timezone = timezone_order$timezone, timezone_cnt = timezone_order$cnt)
df_customer_timezone <- data.frame(timezone = timezone_customer$timezone, customer_cnt = timezone_customer$cnt)
df_timezone <- merge(df_customer_timezone,df_order_timezone,by="timezone")
df_timezone <- df_timezone %>% mutate(per_order = round(timezone_cnt / customer_cnt,2))

timezone_order <- left_join(timezone_order,df_timezone[,c(1,4)])

Joining, by = "timezone"
Warning message:
In left_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining factor and character vector, coercing into character vector

options(repr.plot.height=5)
p1 <- ggplot(timezone_order,aes(x=timezone,y=prob,fill=timezone)) + geom_bar(stat="identity")
p2 <- ggplot(timezone_order,aes(x=timezone,y=per_order,fill=timezone)) + geom_bar(stat="identity")
gridExtra::grid.arrange(p1, p2)

11. Segmnet 요일 구매 패턴¶

options(repr.plot.height=3)
segment_mon_order <- order_df %>% group_by(SHOPPER_SEGMENT_CODE,ORDER_DAY) %>% summarize(or_cnt=n())
segment_mon_order$ORDER_DAY = factor(segment_mon_order$ORDER_DAY,labels=c("SUN","MON","TUE","WED","THU","FRI","SAT"),ordered=TRUE)
ggplot(segment_mon_order,aes(x=SHOPPER_SEGMENT_CODE,y=or_cnt,fill=ORDER_DAY)) + geom_bar(stat="identity",position="dodge")

Warning message:
: Removed 7 rows containing missing values (geom_bar).

12 . 광고 시청 유무에 따른 구매 및 전체 광고 흐름¶

air_product <- read.csv("data/Product airtime.csv")
head(air_product)

12-1 시간대별 광고 전체¶

air_product$PRODUCT_START_TMS  <- ymd_hm(air_product$PRODUCT_START_TMS)
air_product$PRODUCT_STOP_TMS  <- ymd_hm(air_product$PRODUCT_STOP_TMS)
air_product$AIR_DATE <- ymd_hm(air_product$AIR_DATE)
air_product$START_HOUR <- hour(air_product$PRODUCT_START_TMS)
air_product$STOP_HOUR <- hour(air_product$PRODUCT_STOP_TMS)
air_product$AIR_MON <- month(air_product$AIR_DATE)
head(air_product)

  PRODUCT_ID   AIR_DATE PRODUCT_AIRTIME_MINS   PRODUCT_START_TMS
1       2186 2012-10-16                 0.38 2012-10-16 00:43:00
2       2186 2012-10-16                 5.08 2012-10-16 00:48:00
3       2478 2012-10-16                 0.50 2012-10-16 00:25:00
4       2478 2012-10-16                12.53 2012-10-16 00:31:00
5       6283 2012-10-16                 0.43 2012-10-16 00:47:00
6       6283 2012-10-16                 4.73 2012-10-16 00:53:00
     PRODUCT_STOP_TMS START_HOUR STOP_HOUR AIR_MON
1 2012-10-16 00:44:00          0         0      10
2 2012-10-16 00:53:00          0         0      10
3 2012-10-16 00:26:00          0         0      10
4 2012-10-16 00:43:00          0         0      10
5 2012-10-16 00:48:00          0         0      10
6 2012-10-16 00:58:00          0         0      10

total_air_hour <- air_product %>% group_by(START_HOUR) %>% summarize(cnt = n())
total_air_hour$type = "Total_Ad"
ggplot(total_air_hour,aes(x=factor(START_HOUR),y=cnt,group=2))  + geom_line(size = 2,color="red") + geom_point(color="blue",size=2)

# 구매 제품의 방송 시간에 따른 구매 ####
tmp_df <- order_df[!is.na(order_df$AIR_DATE),]
air_time_order <- tmp_df %>% group_by(START_HOUR) %>% summarize(cnt = n())

air_time_order$type="ConnectToSell"

air_df <- rbind(total_air_hour,air_time_order)
air_df$START_HOUR <- factor(air_df$START_HOUR)

ggplot(air_df,aes(x=START_HOUR,y=cnt)) + geom_line(aes(group=type,color=type),size = 2) + geom_point(aes(group=type,color=type),size = 2)

# 광고를 본사람 vs 안본사람 시간대 비교 ####
air_order_df <- order_df[!is.na(order_df$AIR_DATE),]
no_air_order_df <- order_df[is.na(order_df$AIR_DATE),]

air_time_order <- air_order_df %>% group_by(ORDER_HOUR) %>% summarize(cnt=n())
no_air_time_order <- no_air_order_df %>% group_by(ORDER_HOUR) %>% summarize(cnt=n())

air_time_order$type="air"
no_air_time_order$type="noair"

compare_air <- rbind(air_time_order,no_air_time_order)
ggplot(compare_air,aes(x=ORDER_HOUR,y=cnt)) + geom_line(aes(group=type,color=type),size = 2) + geom_point(aes(group=type,color=type),size = 2)

전체 주문¶

0 ~ 6시 : 타 시간대 대비 광고량 대비 구매량이 현저히 적음
10 ~ 12시 : 주문 횟수 > 광고 횟수, 광고 ↑ : 구매 ↑
20 ~ 23시 : 광고 유무 -> 구매 영향도 높음, 매출 노ㅠ은 짧은 광고 시간의 상품 노출 필요

names(air_df)[1] <- "HOUR"
names(compare_air)[1] <- "HOUR"

# 전체 주문 확보 ####
total_order <- order_df %>% group_by(ORDER_HOUR) %>% summarize(cnt=n())
names(total_order)[1] <- "HOUR"
total_order$type <- "TotalOrder"

tmp_df <- rbind(air_df,compare_air)
tmp_df <- rbind(tmp_df,total_order)
ggplot(tmp_df,aes(x=HOUR,y=cnt)) + geom_line(aes(group=type,color=type),size = 1) + geom_point(aes(group=type,color=type),size = 1)

Answer the Questions¶

1. What is the next product a customer will buy in the next month given their previous buying behavior and product airtime?¶

제품의 경우 같은 Cluster 내 고객들의 구매 내역 중 내가 사지 않은 내역을 추천

order_df <- read.csv("data/order_data_cluster.csv", stringsAsFactors = F)
customer <- read.csv("data/customer_with_cluster.csv",stringsAsFactors = F)

order_df$SHOPPER_SEGMENT_CODE[is.na(order_df$SHOPPER_SEGMENT_CODE)] <- 0
order_df$SHOPPER_SEGMENT_CODE <- as.factor(order_df$SHOPPER_SEGMENT_CODE)

for(i in 1:9){
  tmp_df <- order_df %>% filter(cluster==i)
  assign(paste0("cluster_",i),tmp_df)
}

for(i in 1:9){
  tmp_df <- get(paste0("cluster_",i))
  tmp_df <- tmp_df %>% group_by(CUSTOMER_NBR,MERCH_DIV_DESC) %>% summarize(cnt = n()) %>% arrange(CUSTOMER_NBR)
  assign(paste0("cluster_cate_",i),as.data.frame(tmp_df))
}

for(i in 1:9){
  tmp_df <- get(paste0("cluster_",i))
  tmp_df <- tmp_df %>% group_by(CUSTOMER_NBR,PRODUCT_ID) %>% summarize(cnt = n()) %>% arrange(CUSTOMER_NBR)
  assign(paste0("cluster_product_",i),as.data.frame(tmp_df))
}

id별로 추천 가져오는 Function¶

1. Product 별¶

Product 의 경우 한번 구매한 제품에 대해서 재 구매 확률이 낮으므로 Cluster별로 구매 했던 내역에서 고객이 주문한 내역을 제외한 빈도가 가장 높은 순으로 추천 하여 구매를 예측

getProductRecommand <- function(x){
  custId <- x
  custCluster <- customer[customer$CUSTOMER_NBR==custId,]$cluster
  in_df <- get(paste0("cluster_product_",custCluster))
  in_product_list <- in_df[in_df$CUSTOMER_NBR==custId,]$PRODUCT_ID
  in_product_list <- as.data.frame(in_product_list)
  other_df <- in_df %>% filter(CUSTOMER_NBR!=custId) %>% group_by(PRODUCT_ID) %>% summarize(cnt = sum(cnt)) %>% arrange(desc(cnt))
  other_df <- as.data.frame(other_df)
  other_df$percent <- round(prop.table(other_df$cnt),2)
  result_df <- other_df %>% filter(!(PRODUCT_ID %in% in_product_list$in_producut_list))
  result_df <- left_join(result_df,product)
  names(in_product_list)[1] <- "PRODUCT_ID"
  in_product_list <- left_join(in_product_list,product)
  print(in_product_list[,c(1,2)])
  print(head(result_df[,c(1,4,2,3)]))
}

2. In what product category is a customer likely to buy their next product, given their previous buying behavior and product airtime?¶

카테고리의 경우 같은 카테고리를 구매할 가능성이 높으므로 연관분석을 통해 추천

2. Category¶

getCateRecommand <- function(x){
  custId <- x
  custCluster <- customer[customer$CUSTOMER_NBR==custId,]$cluster
  if(length(custCluster)==0){
    return(0)
  }
  baskets <-  read_baskets(con  = paste0("data/item_cate_",custCluster,".txt"),sep = "#", info = c("sequenceID","eventID","SIZE"))
  baskets <- cspade(baskets, parameter = list(support = 0.0001), control = list(verbose = TRUE))
  baskets_df <- as(baskets, "data.frame")
  baskets_df <- baskets_df[order(baskets_df$support,decreasing = T),]
  baskets_df$sequence <- gsub(pattern = "<\\{\"","",baskets_df$sequence)
  baskets_df$sequence <- gsub(pattern = "\"\\}>","",baskets_df$sequence)
  baskets_df$sequence <- gsub(pattern = "\"","",baskets_df$sequence)
  last_df <- order_df[order_df$CUSTOMER_NBR==custId,] %>%
    arrange(desc(ORDER_DATES)) %>%
    select(MERCH_DIV_DESC)
  last_cate <- last_df$MERCH_DIV_DESC
  support_df <- baskets_df %>% filter(str_detect(sequence,last_cate))
  print(paste0(last_cate))
  print(head(support_df[-1,]))
}

getRecommand <- function(cate,y){
  if(cate=="cate"){
    getCateRecommand(y)
  }else{
    getProductRecommand(y)
  }
}

getRecommand("cate",1275)

parameter specification:
support : 1e-04
maxsize :    10
maxlen  :    10

algorithmic control:
bfstype  : FALSE
verbose  :  TRUE
summary  : FALSE
tidLists : FALSE

preprocessing ... 1 partition(s), 0.09 MB [0.66s]
mining transactions ... 0 MB [0.4s]
reading sequences ... [0.11s]

total elapsed time: 1.17s
[1] "Entertainment" "Health/Beauty" "Health/Beauty" "Health/Beauty"
[5] "Health/Beauty" "Health/Beauty"
                       sequence      support
2      Health/Beauty,Home Decor 0.0032706460
3 Health/Beauty,IQVC Divisional 0.0029980921
4         Health/Beauty,Apparel 0.0027255383
5     Accessories,Health/Beauty 0.0010902153
6         Health/Beauty,Jewelry 0.0008176615
7      Health/Beauty,Housewares 0.0008176615

getRecommand("product",749351)

Joining, by = "PRODUCT_ID"
Joining, by = "PRODUCT_ID"

  PRODUCT_ID                               PACKING_SLIP_DESC
1       9887      Layers by Lizden Marvelush Smooth & Ribbed
2      11916  Milana Bra by Genie Set of 2 Lace Overlay Bras
3      32338          Set of 20 Faceted Droplet Ornaments by
4      44978     UltraFine Silver 8mm Reversible Diamond Cut
5      49825 Simply Stacks Sterling 4mm Round Peridot Solita
6      49826 Simply Stacks Sterling 4mm Round Pink Tourmalin
  PRODUCT_ID                              PACKING_SLIP_DESC cnt percent
1      61109    FlashPad 2.0 Touchscreen Handheld Game with  51    0.02
2      10811 BareTraps Tall Shaft Boots w/ Ruching & Gore -  42    0.01
3      11916 Milana Bra by Genie Set of 2 Lace Overlay Bras  40    0.01
4      32565   Berkshire Blanket Super Soft Oversized Plush  31    0.01
5      32612   Bethlehem Lights Indoor Outdoor 20 ft 50 LED  29    0.01
6       9887     Layers by Lizden Marvelush Smooth & Ribbed  26    0.01

3. What are the products and categories that sell better in a particular geographic, time zone, and customer segment?¶

3-1 particular geographic¶

v3_1 <- order_df %>%
  group_by(STATE,PACKING_SLIP_DESC) %>%
  summarize(Pro_cnt=n()) %>%
  slice(which.max(Pro_cnt))

v3_1_2 <- order_df %>% group_by(STATE,MERCH_DIV_DESC) %>% summarize(Category_cnt=n()) %>% slice(which.max(Category_cnt))

head(v3_1)

head(v3_1_2)

3-2 Time Zone¶

v3_3 <- order_df %>% group_by(timezone,PACKING_SLIP_DESC) %>% summarize(Hour_P_cnt=n()) %>% slice(which.max(Hour_P_cnt))
v3_4 <- order_df %>% group_by(timezone,MERCH_DIV_DESC) %>% summarize(Hour_C_cnt=n()) %>% slice(which.max(Hour_C_cnt))

head(v3_3)

head(v3_4)

3-3 Segment¶

cs_prodcut <- order_df %>%
  group_by(SHOPPER_SEGMENT_CODE,PACKING_SLIP_DESC) %>%
  summarize(pro_cnt=n()) %>%
  slice(which.max(pro_cnt))
head(cs_prodcut)

cs_category <- order_df %>%
  group_by(SHOPPER_SEGMENT_CODE,MERCH_DIV_DESC) %>%
  summarize(cate_cnt=n()) %>%
  slice(which.max(cate_cnt))
head(cs_category)

3-4 TimeZone & Segment Cross¶

# 3. 타임존, 세그먼트별 잘팔리는 제품. 
tzsg_product <- order_df %>% group_by(timezone,SHOPPER_SEGMENT_CODE,PACKING_SLIP_DESC) %>% summarize(tzsg_pro_cnt=n()) %>% slice(which.max(tzsg_pro_cnt))
tzsg_category <- order_df %>% group_by(timezone,SHOPPER_SEGMENT_CODE,MERCH_DIV_DESC) %>% summarize(tzsg_cate_cnt=n()) %>% slice(which.max(tzsg_cate_cnt))
tzsg_product$SHOPPER_SEGMENT_CODE[is.na(tzsg_product$SHOPPER_SEGMENT_CODE)] <- 0
tzsg_category$SHOPPER_SEGMENT_CODE[is.na(tzsg_category$SHOPPER_SEGMENT_CODE)] <- 0

head(tzsg_category)

tail(tzsg_category)

4. Is there a best time of day to sell a particular product or product category?¶

product_per_hour <- order_df %>%
  group_by(PACKING_SLIP_DESC,ORDER_HOUR) %>%
  summarize(pro_cnt=n()) %>%
  slice(which.max(pro_cnt)) %>%
  filter(pro_cnt>10) %>% arrange(ORDER_HOUR)

head(product_per_hour)

5.What is the brand affinity (personal connection with the brands QVC sells) for QVC’s different customer segments?¶

간단하게 답변한다면 Segment별로 좋아하는 Brand를 가지고 오면 된다.
하지만 각 브랜드별로 어떤 상관관계가 있고 구매를 촉진 시키는지 Cross Selling이 가능한지 여부를 확인해야한다.

segment_brand <- order_df %>% group_by(SHOPPER_SEGMENT_CODE,BRAND_NAME) %>% summarize(cnt = n()) %>% arrange(SHOPPER_SEGMENT_CODE,desc(cnt))
segment_brand_max <- order_df %>% filter(BRAND_NAME!="Not Known" & BRAND_NAME!="N/A") %>% group_by(SHOPPER_SEGMENT_CODE,BRAND_NAME) %>% summarize(cnt = n()) %>% slice(which.max(cnt))
segment_brand_max$SHOPPER_SEGMENT_CODE[is.na(segment_brand_max$SHOPPER_SEGMENT_CODE)] <- 0

segment_brand_max

브랜드별 상관관계¶

상위 20개 브랜드¶

count_brand <- order_df %>% group_by(BRAND_NAME) %>% summarize(cnt = n()) %>% arrange(desc(cnt)) %>% head(20)

brand_list <- count_brand$BRAND_NAME
brand_list <- as.data.frame(brand_list)
brand_rel <- order_df %>% filter(BRAND_NAME %in% brand_list$brand_list & BRAND_NAME != "Not Known" & BRAND_NAME != "N/A") %>% group_by(CUSTOMER_NBR,BRAND_NAME) %>% summarize(cnt = n())
brand_rel <- dcast(CUSTOMER_NBR ~ BRAND_NAME, data = brand_rel, value.var = "cnt")
brand_rel[is.na(brand_rel)] <- 0
brand_cor <- round(cor(brand_rel),2)
brand_cor <- as.data.frame(brand_cor)

library(psych)

options(repr.plot.width=8, repr.plot.height=8)
pairs.panels(brand_rel)

brand_cor

	STATE	ZIP_CODE	CUSTOMER_NBR	SHOPPER_SEGMENT_CODE	city	latitude	longitude	timezone
1	AK	99501	442649	2	Anchorage	61.216799	-149.87828	US/Alaska
2	AK	99501	657615	3	Anchorage	61.216799	-149.87828	US/Alaska
3	AK	99501	315353	1	Anchorage	61.216799	-149.87828	US/Alaska
4	AK	99501	235837	2	Anchorage	61.216799	-149.87828	US/Alaska
5	AK	99502	516194	3	Anchorage	61.153693	-149.95932	US/Alaska
6	AK	99502	224909	0	Anchorage	61.153693	-149.95932	US/Alaska

	ZIP_CODE	cnt	state	latitude	longitude
1	1001	8	MA	42.07061	-72.62029
2	1007	6	MA	42.278424	-72.411
3	1013	9	MA	42.161492	-72.60316
4	1020	9	MA	42.175992	-72.57265
5	1027	6	MA	42.27279	-72.68677
6	1028	8	MA	42.066795	-72.5102

	timezone	cnt	prob
1	CST	8791	23.4
2	EST	21504	57.2
3	MST	1889	5
4	PST	5201	13.8
5	US/Alaska	151	0.4
6	US/Hawaii	82	0.2

	PRODUCT_ID	AIR_DATE	PRODUCT_AIRTIME_MINS	PRODUCT_START_TMS	PRODUCT_STOP_TMS
1	2186	2012-10-16 0:00	0.38	2012-10-16 0:43	2012-10-16 0:44
2	2186	2012-10-16 0:00	5.08	2012-10-16 0:48	2012-10-16 0:53
3	2478	2012-10-16 0:00	0.5	2012-10-16 0:25	2012-10-16 0:26
4	2478	2012-10-16 0:00	12.53	2012-10-16 0:31	2012-10-16 0:43
5	6283	2012-10-16 0:00	0.43	2012-10-16 0:47	2012-10-16 0:48
6	6283	2012-10-16 0:00	4.73	2012-10-16 0:53	2012-10-16 0:58

	timezone	MERCH_DIV_DESC	Hour_C_cnt
1	CST6CDT	Home Decor	1855
2	EST5EDT	Home Decor	4656
3	MST	Home Decor	91
4	MST7MDT	Home Decor	334
5	PST8PDT	Apparel	1133
6	US/Alaska	Home Decor	34

	STATE	PACKING_SLIP_DESC	Pro_cnt
1	AK	Berkshire Blanket Super Soft Oversized Plush	11
2	AL	BareTraps Tall Shaft Boots w/ Ruching & Gore -	8
3	AR	Berkshire Blanket Super Soft Oversized Plush	9
4	AZ	WEN by Chaz Dean Fall Ginger Pumpkin	7
5	CA	Berkshire Blanket Super Soft Oversized Plush	41
6	CO	Berkshire Blanket Super Soft Oversized Plush	12

	SHOPPER_SEGMENT_CODE	PACKING_SLIP_DESC	pro_cnt
1	1	FlashPad 2.0 Touchscreen Handheld Game with	86
2	2	Bethlehem Lights Product Recall Packet	26
3	3	Bethlehem Lights Product Recall Packet	77
4	4	FlashPad 2.0 Touchscreen Handheld Game with	335
5	5	Berkshire Blanket Super Soft Oversized Plush	34
6	NA	FlashPad 2.0 Touchscreen Handheld Game with	82

	PACKING_SLIP_DESC	pro_cnt
1	Berkshire Blanket Super Soft Oversized Plush	132
2	Duraflame 750W/1500W Electric Stove Heater with	17
3	FlashPad 2.0 Touchscreen Handheld Game with	66
4	INFOtainment Apple iPad Charging and Docking	33
5	Josie Maran Art of Giving 8-pc Argan Bath &	20
6	Judith Ripka Stainless Steel Chronograph Watch	16

	SHOPPER_SEGMENT_CODE	BRAND_NAME	cnt
1	1	WEN	218
2	2	Temp-tations	46
3	3	Denim & Co.	107
4	4	Denim & Co.	1857
5	5	WEN	95
6	0	Susan Graver	246

	CUSTOMER_NBR	BareTraps	Bethlehem Lights	Clarks	Denim & Co.	Emjoi	Flash Pad	Genie Bra	Isaac Mizrahi Live!	Josie Maran	Liz Claiborne New York	Lizden	Mally Beauty	philosophy	Quacker Factory	Susan Graver	Temp-tations	Valerie Parr Hill	WEN
CUSTOMER_NBR	1.00	0.01	0.00	-0.02	0.02	0.00	-0.01	0.02	0.02	0.00	0.00	0.01	-0.01	0.01	0.00	0.03	0.02	-0.02	0.01
BareTraps	0.01	1.00	0.02	-0.01	0.04	0.01	0.02	0.00	0.04	0.00	0.01	0.03	-0.01	0.00	-0.01	0.00	0.01	0.04	-0.04
Bethlehem Lights	0.00	0.02	1.00	0.01	0.07	-0.02	0.06	0.04	0.05	0.02	0.02	0.03	0.02	0.01	0.03	0.02	0.05	0.13	-0.02
Clarks	-0.02	-0.01	0.01	1.00	0.05	0.01	-0.01	0.02	0.02	0.00	0.04	0.04	-0.02	0.01	0.00	0.04	-0.01	0.01	-0.03
Denim & Co.	0.02	0.04	0.07	0.05	1.00	0.01	-0.01	0.06	0.11	0.02	0.22	0.14	0.05	0.10	0.24	0.36	0.04	0.11	-0.01
Emjoi	0.00	0.01	-0.02	0.01	0.01	1.00	0.03	0.01	0.01	0.00	0.02	0.02	0.00	-0.01	-0.01	0.03	-0.01	0.02	-0.03
Flash Pad	-0.01	0.02	0.06	-0.01	-0.01	0.03	1.00	0.06	0.01	0.00	-0.02	0.03	-0.02	-0.03	0.01	-0.01	-0.03	0.01	-0.03
Genie Bra	0.02	0.00	0.04	0.02	0.06	0.01	0.06	1.00	0.09	0.03	0.11	0.10	0.02	0.02	0.05	0.13	0.01	0.01	-0.03
Isaac Mizrahi Live!	0.02	0.04	0.05	0.02	0.11	0.01	0.01	0.09	1.00	0.03	0.28	0.11	0.02	0.06	0.09	0.17	0.01	0.01	-0.01
Josie Maran	0.00	0.00	0.02	0.00	0.02	0.00	0.00	0.03	0.03	1.00	0.03	0.02	0.08	0.06	-0.01	0.01	-0.01	0.05	0.04
Liz Claiborne New York	0.00	0.01	0.02	0.04	0.22	0.02	-0.02	0.11	0.28	0.03	1.00	0.16	0.05	0.02	0.23	0.44	0.01	0.02	-0.01
Lizden	0.01	0.03	0.03	0.04	0.14	0.02	0.03	0.10	0.11	0.02	0.16	1.00	0.02	0.04	0.12	0.21	0.02	0.07	-0.01
Mally Beauty	-0.01	-0.01	0.02	-0.02	0.05	0.00	-0.02	0.02	0.02	0.08	0.05	0.02	1.00	0.04	0.06	0.08	-0.01	0.04	0.01
philosophy	0.01	0.00	0.01	0.01	0.10	-0.01	-0.03	0.02	0.06	0.06	0.02	0.04	0.04	1.00	0.00	0.03	0.00	0.06	0.00
Quacker Factory	0.00	-0.01	0.03	0.00	0.24	-0.01	0.01	0.05	0.09	-0.01	0.23	0.12	0.06	0.00	1.00	0.37	0.01	0.04	-0.03
Susan Graver	0.03	0.00	0.02	0.04	0.36	0.03	-0.01	0.13	0.17	0.01	0.44	0.21	0.08	0.03	0.37	1.00	0.01	0.03	-0.02
Temp-tations	0.02	0.01	0.05	-0.01	0.04	-0.01	-0.03	0.01	0.01	-0.01	0.01	0.02	-0.01	0.00	0.01	0.01	1.00	0.11	-0.03
Valerie Parr Hill	-0.02	0.04	0.13	0.01	0.11	0.02	0.01	0.01	0.01	0.05	0.02	0.07	0.04	0.06	0.04	0.03	0.11	1.00	-0.02
WEN	0.01	-0.04	-0.02	-0.03	-0.01	-0.03	-0.03	-0.03	-0.01	0.04	-0.01	-0.01	0.01	0.00	-0.03	-0.02	-0.03	-0.02	1.00

QVC Analysis (E-Commerce 판매 데이터 분석)¶

발표 자료¶

필요한 파일.¶

Data Load¶

Data 확인.¶

1. 고객 분포도. Zip Code 별 고객 분포도.¶

지도 그리기¶

지도그리기 2¶

timezone 별 고객 분포도¶

2. State 별 고객 분포¶

3. State 별 Segment 별 고객 분포¶

4. segment 별 고객 분포¶

5. segment 별 구매 횟수¶

5.1 segment 별 회원수¶

5.2 월별 판매 수¶

6. 요일별 주문 Count¶

7. 카테고리별 구매 횟수 5개¶

8. 가장 많이 팔린 제품 상위 5개¶

9. 시간대별 판매 수¶

10. Timezone 별 주문 횟수¶

11. Segmnet 요일 구매 패턴¶

12 . 광고 시청 유무에 따른 구매 및 전체 광고 흐름¶

12-1 시간대별 광고 전체¶

전체 주문¶

Answer the Questions¶

1. What is the next product a customer will buy in the next month given their previous buying behavior and product airtime?¶

id별로 추천 가져오는 Function¶

1. Product 별¶

2. In what product category is a customer likely to buy their next product, given their previous buying behavior and product airtime?¶

2. Category¶

3. What are the products and categories that sell better in a particular geographic, time zone, and customer segment?¶

3-1 particular geographic¶

3-2 Time Zone¶

3-3 Segment¶

3-4 TimeZone & Segment Cross¶

4. Is there a best time of day to sell a particular product or product category?¶

5.What is the brand affinity (personal connection with the brands QVC sells) for QVC’s different customer segments?¶

브랜드별 상관관계¶

상위 20개 브랜드¶

결론 : 최적의 방송시간 및 마케팅전략¶