library(stringr)
library(arules)
library(dplyr)
library(maps)
library(ggplot2)
library(ggmap)
library(gridExtra)
library(reshape2)
library(arulesSequences)
library(lubridate)
theme_set(theme_gray(base_size = 13))
customer <- read.csv("data/Customer master2.csv")
customer$SHOPPER_SEGMENT_CODE <- as.factor(customer$SHOPPER_SEGMENT_CODE)
head(customer)
order_df <- read.csv("data/order_data.csv", stringsAsFactors = F)
order_df$SHOPPER_SEGMENT_CODE[is.na(order_df$SHOPPER_SEGMENT_CODE)] <- 0
order_df$SHOPPER_SEGMENT_CODE <- as.factor(order_df$SHOPPER_SEGMENT_CODE)
product <- read.csv("data/Product master2.csv", stringsAsFactor = F)
zip <- read.csv("data/zipcode.csv", stringsAsFactors = F)
v_customer <- customer %>% group_by(ZIP_CODE) %>% summarize(cnt = n()) %>% filter(cnt > 5)
v_customer <- merge(v_customer,zip[,c(1,3:5)],by.x="ZIP_CODE",by.y="zip")
head(v_customer)
options(repr.plot.width=5, repr.plot.height=5)
map<-get_map(location='united states', zoom=4, maptype = "terrain",
source='google',color='color')
ggmap(map) + geom_point(
aes(x=longitude, y=latitude, show_guide = TRUE, colour=cnt, size=cnt),
data=v_customer, alpha=1,na.rm = T) +
scale_color_gradient(low="beige", high="blue")
options(repr.plot.width=5, repr.plot.height=3)
us <- map_data("state")
city <- customer %>% group_by(city) %>% summarize(cnt = n())
city <- city %>% mutate(avg = cnt/mean(cnt))
city$city <- tolower(city$city)
gg <- ggplot()
gg <- gg + geom_map(data=us, map=us,
aes(x=long, y=lat, map_id=region),
fill="#ffffff", color="#ffffff", size=0.25)
gg <- gg + geom_map(data=city, map=us,
aes(fill=cnt, map_id=city),
color="#ffffff", size=0.25)
gg <- gg + scale_fill_continuous(low='thistle2', high='darkred',
guide='colorbar')
gg <- gg + labs(x=NULL, y=NULL)
gg <- gg + coord_map("albers", lat0 = 39, lat1 = 45)
gg <- gg + theme(panel.border = element_blank())
gg <- gg + theme(panel.background = element_blank())
gg <- gg + theme(axis.ticks = element_blank())
gg <- gg + theme(axis.text = element_blank())
gg
options(repr.plot.width=7, repr.plot.height=3)
timezone_customer <- customer %>% group_by(timezone) %>% summarize(cnt=n())
ggplot(timezone_customer,aes(x=timezone,y=cnt,fill=timezone)) + geom_bar(stat="identity")
state_number <- v_customer %>% group_by(state) %>% summarize(cnt = sum(cnt)) %>% arrange(desc(cnt))
ggplot(v_customer, aes(x=state,y=cnt)) + geom_bar(stat="identity") + theme(axis.text.x = element_text(angle = 90, hjust = 1))
customer$SHOPPER_SEGMENT_CODE[is.na(customer$SHOPPER_SEGMENT_CODE)] <- 0
customer$SHOPPER_SEGMENT_CODE <- factor(customer$SHOPPER_SEGMENT_CODE)
seg_customer <- customer
names(seg_customer)[4] <- "CODE"
ggplot(seg_customer,aes(x=STATE,fill=CODE)) + geom_bar() + theme(axis.text.x = element_text(angle = 90, hjust = 1))
segment_customer <- customer %>% group_by(SHOPPER_SEGMENT_CODE) %>% summarize(cu_cnt = n()) %>% mutate(prob = round((cu_cnt/sum(cu_cnt)),2)*100)
segment_customer$SHOPPER_SEGMENT_CODE[is.na(segment_customer$SHOPPER_SEGMENT_CODE)] <- 0
bp<- ggplot(segment_customer, aes(x="", y=cu_cnt, fill=as.factor(SHOPPER_SEGMENT_CODE)))+
geom_bar(width = 1, stat = "identity")
pie <- bp + coord_polar("y", start=0)
# scale_fill_brewer("Blues") + blank_theme +
pie +
#theme(axis.text.x=element_blank())+
geom_text(aes(y = cu_cnt/3 + c(0, cumsum(cu_cnt)[-length(cu_cnt)]),
label = paste0(SHOPPER_SEGMENT_CODE,":",prob,"%")), size=3)
segment_order <- order_df %>% group_by(SHOPPER_SEGMENT_CODE) %>% summarize(or_cnt=n())
segment_order$percent <- round(prop.table(segment_order$or_cnt),2)
ggplot(segment_order,aes(x=SHOPPER_SEGMENT_CODE,y=or_cnt,fill=SHOPPER_SEGMENT_CODE)) + geom_bar(stat="identity",colour="black")
segment_cus_order <- order_df %>% distinct(CUSTOMER_NBR,SHOPPER_SEGMENT_CODE) %>% group_by(SHOPPER_SEGMENT_CODE) %>% summarize(or_cnt=n())
# 합쳐서
seg <- merge(segment_customer,segment_order,by="SHOPPER_SEGMENT_CODE")
seg <- seg %>% mutate(avg = round(or_cnt/cu_cnt,2))
ggplot(seg,aes(x=SHOPPER_SEGMENT_CODE,y=avg,fill=SHOPPER_SEGMENT_CODE)) + geom_bar(stat="identity",colour="black")
# 월별 판매 수
mon_order <- order_df %>% group_by(ORDER_MON) %>% summarize(cnt =n())
mon_order$ORDER_MON <- as.factor(mon_order$ORDER_MON)
mon_order$per <- round(prop.table(mon_order$cnt),2) * 100
ggplot(data=mon_order,aes(x=ORDER_MON,y=cnt,fill=ORDER_MON)) +geom_bar(stat="identity")
v_day <- order_df %>% group_by(ORDER_DAY) %>% summarize(cnt=n())
v_day$ORDER_DAY = factor(v_day$ORDER_DAY,labels=c("SUN","MON","TUE","WED","THU","FRI","SAT"),ordered=TRUE)
ggplot(v_day, aes(x=ORDER_DAY,y=cnt,fill=ORDER_DAY)) + geom_bar(stat="identity",colour="black")
order_per_cate <- order_df %>% group_by(MERCH_DIV_DESC) %>% summarize(cnt = n()) %>% arrange(desc(cnt))
ggplot(head(order_per_cate), aes(x=MERCH_DIV_DESC,y=cnt,fill=MERCH_DIV_DESC)) + geom_bar(stat="identity",colour="black")
order_per_product <- order_df %>% group_by(PACKING_SLIP_DESC) %>% summarize(cnt = n()) %>% arrange(desc(cnt))
ggplot(head(order_per_product), aes(x=PACKING_SLIP_DESC,y=cnt,fill=PACKING_SLIP_DESC)) + geom_bar(stat="identity")
v_time_cate <- order_df %>% group_by(ORDER_HOUR,MERCH_DIV_DESC) %>% summarize(cnt = n()) # 다른 데서 line plot으로 시간대별로 다르게 표현.
time_order <- v_time_cate %>% group_by(ORDER_HOUR) %>% summarize(cnt = sum(cnt))
ggplot(data=time_order, aes(x=factor(ORDER_HOUR), y=cnt,group=1)) + geom_line(color="red") + geom_point(color="blue")
timezone_order <- order_df %>% group_by(timezone) %>% summarize(cnt = n()) %>% mutate(prob = round((cnt/sum(cnt)),3)*100)
timezone_order
timezone_customer <- customer %>% group_by(timezone) %>% summarize(cnt=n())
timezone_cluster_customer <- customer %>% group_by(timezone,cluster) %>% summarize(cnt = n())
timezone_segment_customer <- customer %>% group_by(timezone,SHOPPER_SEGMENT_CODE) %>% summarize(cnt=n())
df_order_timezone <- data.frame(timezone = timezone_order$timezone, timezone_cnt = timezone_order$cnt)
df_customer_timezone <- data.frame(timezone = timezone_customer$timezone, customer_cnt = timezone_customer$cnt)
df_timezone <- merge(df_customer_timezone,df_order_timezone,by="timezone")
df_timezone <- df_timezone %>% mutate(per_order = round(timezone_cnt / customer_cnt,2))
timezone_order <- left_join(timezone_order,df_timezone[,c(1,4)])
options(repr.plot.height=5)
p1 <- ggplot(timezone_order,aes(x=timezone,y=prob,fill=timezone)) + geom_bar(stat="identity")
p2 <- ggplot(timezone_order,aes(x=timezone,y=per_order,fill=timezone)) + geom_bar(stat="identity")
gridExtra::grid.arrange(p1, p2)
options(repr.plot.height=3)
segment_mon_order <- order_df %>% group_by(SHOPPER_SEGMENT_CODE,ORDER_DAY) %>% summarize(or_cnt=n())
segment_mon_order$ORDER_DAY = factor(segment_mon_order$ORDER_DAY,labels=c("SUN","MON","TUE","WED","THU","FRI","SAT"),ordered=TRUE)
ggplot(segment_mon_order,aes(x=SHOPPER_SEGMENT_CODE,y=or_cnt,fill=ORDER_DAY)) + geom_bar(stat="identity",position="dodge")
air_product <- read.csv("data/Product airtime.csv")
head(air_product)
air_product$PRODUCT_START_TMS <- ymd_hm(air_product$PRODUCT_START_TMS)
air_product$PRODUCT_STOP_TMS <- ymd_hm(air_product$PRODUCT_STOP_TMS)
air_product$AIR_DATE <- ymd_hm(air_product$AIR_DATE)
air_product$START_HOUR <- hour(air_product$PRODUCT_START_TMS)
air_product$STOP_HOUR <- hour(air_product$PRODUCT_STOP_TMS)
air_product$AIR_MON <- month(air_product$AIR_DATE)
head(air_product)
total_air_hour <- air_product %>% group_by(START_HOUR) %>% summarize(cnt = n())
total_air_hour$type = "Total_Ad"
ggplot(total_air_hour,aes(x=factor(START_HOUR),y=cnt,group=2)) + geom_line(size = 2,color="red") + geom_point(color="blue",size=2)
# 구매 제품의 방송 시간에 따른 구매 ####
tmp_df <- order_df[!is.na(order_df$AIR_DATE),]
air_time_order <- tmp_df %>% group_by(START_HOUR) %>% summarize(cnt = n())
air_time_order$type="ConnectToSell"
air_df <- rbind(total_air_hour,air_time_order)
air_df$START_HOUR <- factor(air_df$START_HOUR)
ggplot(air_df,aes(x=START_HOUR,y=cnt)) + geom_line(aes(group=type,color=type),size = 2) + geom_point(aes(group=type,color=type),size = 2)
# 광고를 본사람 vs 안본사람 시간대 비교 ####
air_order_df <- order_df[!is.na(order_df$AIR_DATE),]
no_air_order_df <- order_df[is.na(order_df$AIR_DATE),]
air_time_order <- air_order_df %>% group_by(ORDER_HOUR) %>% summarize(cnt=n())
no_air_time_order <- no_air_order_df %>% group_by(ORDER_HOUR) %>% summarize(cnt=n())
air_time_order$type="air"
no_air_time_order$type="noair"
compare_air <- rbind(air_time_order,no_air_time_order)
ggplot(compare_air,aes(x=ORDER_HOUR,y=cnt)) + geom_line(aes(group=type,color=type),size = 2) + geom_point(aes(group=type,color=type),size = 2)
names(air_df)[1] <- "HOUR"
names(compare_air)[1] <- "HOUR"
# 전체 주문 확보 ####
total_order <- order_df %>% group_by(ORDER_HOUR) %>% summarize(cnt=n())
names(total_order)[1] <- "HOUR"
total_order$type <- "TotalOrder"
tmp_df <- rbind(air_df,compare_air)
tmp_df <- rbind(tmp_df,total_order)
ggplot(tmp_df,aes(x=HOUR,y=cnt)) + geom_line(aes(group=type,color=type),size = 1) + geom_point(aes(group=type,color=type),size = 1)
order_df <- read.csv("data/order_data_cluster.csv", stringsAsFactors = F)
customer <- read.csv("data/customer_with_cluster.csv",stringsAsFactors = F)
order_df$SHOPPER_SEGMENT_CODE[is.na(order_df$SHOPPER_SEGMENT_CODE)] <- 0
order_df$SHOPPER_SEGMENT_CODE <- as.factor(order_df$SHOPPER_SEGMENT_CODE)
for(i in 1:9){
tmp_df <- order_df %>% filter(cluster==i)
assign(paste0("cluster_",i),tmp_df)
}
for(i in 1:9){
tmp_df <- get(paste0("cluster_",i))
tmp_df <- tmp_df %>% group_by(CUSTOMER_NBR,MERCH_DIV_DESC) %>% summarize(cnt = n()) %>% arrange(CUSTOMER_NBR)
assign(paste0("cluster_cate_",i),as.data.frame(tmp_df))
}
for(i in 1:9){
tmp_df <- get(paste0("cluster_",i))
tmp_df <- tmp_df %>% group_by(CUSTOMER_NBR,PRODUCT_ID) %>% summarize(cnt = n()) %>% arrange(CUSTOMER_NBR)
assign(paste0("cluster_product_",i),as.data.frame(tmp_df))
}
getProductRecommand <- function(x){
custId <- x
custCluster <- customer[customer$CUSTOMER_NBR==custId,]$cluster
in_df <- get(paste0("cluster_product_",custCluster))
in_product_list <- in_df[in_df$CUSTOMER_NBR==custId,]$PRODUCT_ID
in_product_list <- as.data.frame(in_product_list)
other_df <- in_df %>% filter(CUSTOMER_NBR!=custId) %>% group_by(PRODUCT_ID) %>% summarize(cnt = sum(cnt)) %>% arrange(desc(cnt))
other_df <- as.data.frame(other_df)
other_df$percent <- round(prop.table(other_df$cnt),2)
result_df <- other_df %>% filter(!(PRODUCT_ID %in% in_product_list$in_producut_list))
result_df <- left_join(result_df,product)
names(in_product_list)[1] <- "PRODUCT_ID"
in_product_list <- left_join(in_product_list,product)
print(in_product_list[,c(1,2)])
print(head(result_df[,c(1,4,2,3)]))
}
getCateRecommand <- function(x){
custId <- x
custCluster <- customer[customer$CUSTOMER_NBR==custId,]$cluster
if(length(custCluster)==0){
return(0)
}
baskets <- read_baskets(con = paste0("data/item_cate_",custCluster,".txt"),sep = "#", info = c("sequenceID","eventID","SIZE"))
baskets <- cspade(baskets, parameter = list(support = 0.0001), control = list(verbose = TRUE))
baskets_df <- as(baskets, "data.frame")
baskets_df <- baskets_df[order(baskets_df$support,decreasing = T),]
baskets_df$sequence <- gsub(pattern = "<\\{\"","",baskets_df$sequence)
baskets_df$sequence <- gsub(pattern = "\"\\}>","",baskets_df$sequence)
baskets_df$sequence <- gsub(pattern = "\"","",baskets_df$sequence)
last_df <- order_df[order_df$CUSTOMER_NBR==custId,] %>%
arrange(desc(ORDER_DATES)) %>%
select(MERCH_DIV_DESC)
last_cate <- last_df$MERCH_DIV_DESC
support_df <- baskets_df %>% filter(str_detect(sequence,last_cate))
print(paste0(last_cate))
print(head(support_df[-1,]))
}
getRecommand <- function(cate,y){
if(cate=="cate"){
getCateRecommand(y)
}else{
getProductRecommand(y)
}
}
getRecommand("cate",1275)
getRecommand("product",749351)
v3_1 <- order_df %>%
group_by(STATE,PACKING_SLIP_DESC) %>%
summarize(Pro_cnt=n()) %>%
slice(which.max(Pro_cnt))
v3_1_2 <- order_df %>% group_by(STATE,MERCH_DIV_DESC) %>% summarize(Category_cnt=n()) %>% slice(which.max(Category_cnt))
head(v3_1)
head(v3_1_2)
v3_3 <- order_df %>% group_by(timezone,PACKING_SLIP_DESC) %>% summarize(Hour_P_cnt=n()) %>% slice(which.max(Hour_P_cnt))
v3_4 <- order_df %>% group_by(timezone,MERCH_DIV_DESC) %>% summarize(Hour_C_cnt=n()) %>% slice(which.max(Hour_C_cnt))
head(v3_3)
head(v3_4)
cs_prodcut <- order_df %>%
group_by(SHOPPER_SEGMENT_CODE,PACKING_SLIP_DESC) %>%
summarize(pro_cnt=n()) %>%
slice(which.max(pro_cnt))
head(cs_prodcut)
cs_category <- order_df %>%
group_by(SHOPPER_SEGMENT_CODE,MERCH_DIV_DESC) %>%
summarize(cate_cnt=n()) %>%
slice(which.max(cate_cnt))
head(cs_category)
# 3. 타임존, 세그먼트별 잘팔리는 제품.
tzsg_product <- order_df %>% group_by(timezone,SHOPPER_SEGMENT_CODE,PACKING_SLIP_DESC) %>% summarize(tzsg_pro_cnt=n()) %>% slice(which.max(tzsg_pro_cnt))
tzsg_category <- order_df %>% group_by(timezone,SHOPPER_SEGMENT_CODE,MERCH_DIV_DESC) %>% summarize(tzsg_cate_cnt=n()) %>% slice(which.max(tzsg_cate_cnt))
tzsg_product$SHOPPER_SEGMENT_CODE[is.na(tzsg_product$SHOPPER_SEGMENT_CODE)] <- 0
tzsg_category$SHOPPER_SEGMENT_CODE[is.na(tzsg_category$SHOPPER_SEGMENT_CODE)] <- 0
head(tzsg_category)
tail(tzsg_category)
product_per_hour <- order_df %>%
group_by(PACKING_SLIP_DESC,ORDER_HOUR) %>%
summarize(pro_cnt=n()) %>%
slice(which.max(pro_cnt)) %>%
filter(pro_cnt>10) %>% arrange(ORDER_HOUR)
head(product_per_hour)
segment_brand <- order_df %>% group_by(SHOPPER_SEGMENT_CODE,BRAND_NAME) %>% summarize(cnt = n()) %>% arrange(SHOPPER_SEGMENT_CODE,desc(cnt))
segment_brand_max <- order_df %>% filter(BRAND_NAME!="Not Known" & BRAND_NAME!="N/A") %>% group_by(SHOPPER_SEGMENT_CODE,BRAND_NAME) %>% summarize(cnt = n()) %>% slice(which.max(cnt))
segment_brand_max$SHOPPER_SEGMENT_CODE[is.na(segment_brand_max$SHOPPER_SEGMENT_CODE)] <- 0
segment_brand_max
count_brand <- order_df %>% group_by(BRAND_NAME) %>% summarize(cnt = n()) %>% arrange(desc(cnt)) %>% head(20)
brand_list <- count_brand$BRAND_NAME
brand_list <- as.data.frame(brand_list)
brand_rel <- order_df %>% filter(BRAND_NAME %in% brand_list$brand_list & BRAND_NAME != "Not Known" & BRAND_NAME != "N/A") %>% group_by(CUSTOMER_NBR,BRAND_NAME) %>% summarize(cnt = n())
brand_rel <- dcast(CUSTOMER_NBR ~ BRAND_NAME, data = brand_rel, value.var = "cnt")
brand_rel[is.na(brand_rel)] <- 0
brand_cor <- round(cor(brand_rel),2)
brand_cor <- as.data.frame(brand_cor)
library(psych)
options(repr.plot.width=8, repr.plot.height=8)
pairs.panels(brand_rel)
brand_cor