Clustering Analysis based on User Characteristics
engagement levels, utilizing data
Source Data: http://youngyoon.me/archives/30
Objectives
We clustered users on several metrics to identify the characteristics shared by the users within each cluster. The results were used to plan future events aimed at improving retention. The variables used to segment users were as follows: User Level, Quarterly Attendance, Quarterly Payment, and Combat Power.
*Given the nature of free-to-play games, there are many non-paying users. To achieve more accurate clustering, we initially excluded the data of non-paying users.
.
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
# Select the NanumGothic font family for all matplotlib text
# (presumably so the Korean axis labels used in the plots render correctly),
# then echo the active setting as a sanity check.
plt.rcParams['font.family'] = 'NanumGothic'
print(plt.rcParams['font.family'])
# Load the spreadsheet, give its six columns short English names, and
# replace every missing value with 0.
raw_data = (
    pd.read_excel("./data/231222_최종.xlsx")
    .set_axis(
        ["ID", "Attendance", "Level", "Quater_paid", "Combatpower", "dungeonplay"],
        axis="columns",
    )
    .fillna(0)
)
# Missing values remaining per column (all zero after the fill above)
raw_data.isna().sum()
# Preview the first rows
raw_data.head()
# (rows, columns) of the loaded table
raw_data.shape
# Column dtypes and non-null counts
raw_data.info()
# Drop the rows (not columns) whose Level equals -1
cond2 = raw_data["Level"].ne(-1)
data = raw_data[cond2]
data.shape
# Extract each of the five metrics as its own NumPy array
Attendance, Level, Quater_paid, Combatpower, dungeonplay = (
    np.array(data[column])
    for column in ("Attendance", "Level", "Quater_paid", "Combatpower", "dungeonplay")
)
Distribution of data
I checked the basic distribution of the data using scatter plots.
# 3D overview of the three metrics: Attendance (x), Level (y), Combatpower (z)
fig = plt.figure(figsize=(6, 6))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(Attendance, Level, Combatpower, s=20, alpha=0.5)
plt.show()

# Combat power against attendance
fig = plt.figure(figsize=(6, 6))
plt.scatter(Combatpower, Attendance, s=20, alpha=0.5)
# The x data is Combatpower, so the axis is labelled as combat power.
# NOTE(review): if quarterly payment was the intended variable here,
# plot Quater_paid instead and label the axis "분기 과금액".
plt.xlabel("전투력") # Combat Power
plt.ylabel("출석일") # Attendance
plt.show()

# Level against attendance
fig = plt.figure(figsize=(6, 6))
plt.scatter(Level, Attendance, s=20, alpha=0.5)
plt.xlabel("레벨") # Player Level
plt.ylabel("출석일") # Attendance
plt.show()
from sklearn.cluster import MiniBatchKMeans


def _split_cluster(frame, label):
    """Return one cluster's Level, Attendance and Combatpower Series plus a re-indexed table.

    The three Series keep the row index of ``frame``; the table combines
    them with a constant "Cluster" column and a fresh 0..n-1 index.
    """
    members = frame[frame["Cluster"] == label]
    level = members["Level"]
    attendance = members["Attendance"]
    combatpower = members["Combatpower"]
    table = pd.DataFrame(
        {
            "Level": level,
            "Attendance": attendance,
            "Combatpower": combatpower,
            "Cluster": label,
        }
    ).reset_index(drop=True)
    return level, attendance, combatpower, table


# Copy the feature columns so that adding "Cluster" below writes to an
# independent frame rather than to a slice of `data`
# (avoids pandas' SettingWithCopyWarning).
ml_data = data[["Level", "Attendance", "Combatpower"]].copy()

# # of Clusters
K = 3
# Model definition
# NOTE(review): the features are clustered unscaled; if one of them has a much
# larger numeric range it will dominate the distance. Consider standardizing.
model = MiniBatchKMeans(n_clusters=K, random_state=1)
# fit_predict() fits the model and returns the labels in one call, so a
# separate fit() beforehand is unnecessary.
ml_data["Cluster"] = model.fit_predict(ml_data)
# Centroids: one row per cluster, columns in the fitted feature order
# (Level, Attendance, Combatpower).
centers = model.cluster_centers_

# Per-cluster Series and tables. The names are spelled out for clusters 0-2
# because the plotting code further down refers to them; extend this list if K changes.
clustered_0_level, clustered_0_attendance, clustered_0_combatpower, clustered_0 = _split_cluster(ml_data, 0)
clustered_1_level, clustered_1_attendance, clustered_1_combatpower, clustered_1 = _split_cluster(ml_data, 1)
clustered_2_level, clustered_2_attendance, clustered_2_combatpower, clustered_2 = _split_cluster(ml_data, 2)
clustered_data = pd.concat([clustered_0, clustered_1, clustered_2], axis=0)

# Centroid coordinates as a table: x = Level, y = Attendance, z = Combatpower
center_data = pd.DataFrame(
    {
        "x": centers[:, 0],
        "y": centers[:, 1],
        "z": centers[:, 2],
        "Cluster": list(range(K)),
    }
)
import plotly.express as px
import plotly.graph_objects as go
def _cluster_trace(level, attendance, combatpower, color):
    """Build the 3D marker trace for one cluster (x=level, y=attendance, z=combat power)."""
    return go.Scatter3d(
        x=level,
        y=attendance,
        z=combatpower,
        mode="markers",
        marker_opacity=0.6,
        marker=dict(size=5, color=color),
    )


# One semi-transparent trace per cluster, each with its own colour
trace1 = _cluster_trace(clustered_1_level, clustered_1_attendance, clustered_1_combatpower, "#EAF2F8")
trace2 = _cluster_trace(clustered_2_level, clustered_2_attendance, clustered_2_combatpower, "#EAFAF1")
trace0 = _cluster_trace(clustered_0_level, clustered_0_attendance, clustered_0_combatpower, "#F2F4F4")
# Centroids drawn as larger red markers on top of the cluster points
trace_center = go.Scatter3d(
    x=center_data["x"],
    y=center_data["y"],
    z=center_data["z"],
    mode="markers",
    marker={"size": 10, "color": "#CB4335"},
)
fig = go.Figure(data=[trace0, trace1, trace2, trace_center])
fig.show()
After Clustering
After clustering, I generated a 3D graph to review the results. However, the cluster centroids were not clearly visible in it, so I generated additional 2D graphs, one for each pair of variables, to check the centroids.
def _plot_projection(cluster_points, center_points, x_label, y_label):
    """Draw one 2D projection of the clusters with their centroids in red.

    cluster_points: iterable of (x_values, y_values) pairs, one per cluster.
    center_points: iterable of (x, y) centroid coordinates.
    Returns the figure and axes that were drawn.
    """
    fig = plt.figure(figsize=(6, 6))
    ax = fig.add_subplot(111)
    for xs, ys in cluster_points:
        ax.scatter(xs, ys, s=20, alpha=0.5)
    # Centroids are drawn last so they sit on top of the cluster points
    for cx, cy in center_points:
        ax.scatter(cx, cy, marker='o', color="red")
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    plt.show()
    return fig, ax


# Centroid columns follow the fitted feature order: 0 = Level, 1 = Attendance, 2 = Combatpower

# Level vs. attendance
fig, ax = _plot_projection(
    [
        (clustered_0_level, clustered_0_attendance),
        (clustered_1_level, clustered_1_attendance),
        (clustered_2_level, clustered_2_attendance),
    ],
    centers[:, [0, 1]],
    "레벨",  # Player Level
    "출석일",  # Attendance
)

# Attendance vs. combat power
fig, ax = _plot_projection(
    [
        (clustered_0_attendance, clustered_0_combatpower),
        (clustered_1_attendance, clustered_1_combatpower),
        (clustered_2_attendance, clustered_2_combatpower),
    ],
    centers[:, [1, 2]],
    "출석일",  # Attendance
    "전투력",  # Combat Power
)

# Level vs. combat power
fig, ax = _plot_projection(
    [
        (clustered_0_level, clustered_0_combatpower),
        (clustered_1_level, clustered_1_combatpower),
        (clustered_2_level, clustered_2_combatpower),
    ],
    centers[:, [0, 2]],
    "레벨",  # Player Level
    "전투력",  # Combat Power
)
.
.
The importance of gameplay indicators over spending metrics
After breaking down the clustering centroids into various graphs, we found that the quarterly spending amounts were generally distributed too low to significantly impact the creation of centroids. On the other hand, login frequency and character combat power were distributed more evenly, suggesting they had a relatively greater influence on the clustering centroids.
Therefore, going forward we plan to interpret spending amounts as an outcome of core gameplay indicators (login frequency, combat power), and we intend to rerun the analysis with greater emphasis on those gameplay indicators when distinguishing users.
Additionally, we segmented game loyalty among clustered users to execute targeted events, with plans for further analysis of the results in the future.