Skip to content

Commit

Permalink
[analysis] add all_clusters(모든 그룹을 다 표시)
Browse files Browse the repository at this point in the history
  • Loading branch information
Re-st committed Nov 6, 2023
1 parent 9ddf8d3 commit efdf757
Showing 1 changed file with 58 additions and 49 deletions.
107 changes: 58 additions & 49 deletions analysis/age_group.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,30 +3,30 @@
import os
import warnings
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import numpy as np

from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances

# font_location = '/home/gun/playground/techforimpact/main/_data/NanumGothic.TTF'
# font_name = fm.FontProperties(fname=font_location).get_name()
# plt.rcParams["font.family"] = font_name
from matplotlib import font_manager
from matplotlib import cm
from matplotlib import pyplot as plt

# Suppress FutureWarning messages.
warnings.filterwarnings("ignore", category=FutureWarning)

# Parent directory of the directory that contains this file.
BASE_DIR = os.path.join(os.path.dirname(__file__), os.pardir)
# Read a folder name from stdin. The Korean prompt text below asks
# "What is the name of the folder inside _data?".
# NOTE(review): the name is read twice here and `outdir` is reassigned twice
# further down; this looks like the before/after lines of a diff merged
# together -- confirm which of the duplicated lines are actually intended.
outdir = input()
outdir = input("_data 내의 폴더 이름은 무엇인가요?")
# <BASE_DIR>/_data/<name> -- presumably the input data folder; verify against
# the code that reads from `datadir`.
datadir = os.path.join(BASE_DIR, "_data", outdir)
# <BASE_DIR>/output/<name>; `outdir` is later used as the root for the
# directories that are created and the figures that are saved.
outdir = os.path.join(BASE_DIR, "output", outdir)


# Reassigns `outdir` again, joining <BASE_DIR>/output/tmp with the previous
# value of `outdir` (which is already a joined path at this point).
outdir = os.path.join(BASE_DIR, "output", "tmp", outdir)

# Korean font setup for matplotlib: resolve the font family name of
# <BASE_DIR>/_data/NanumSquareL.ttf so it can be passed as `fontname` to plot titles.
font_name = font_manager.FontProperties(fname=os.path.join(BASE_DIR, "_data", "NanumSquareL.ttf")).get_name()
# import matplotlib
# print(matplotlib.matplotlib_fname())
# print(matplotlib.get_cachedir())
# exit(1)
def most_common_age_group(df, d):
"""가장 흔한 10년단위그룹이 뭔지 봅니다.
"""10년단위로 무리짓고 가장 사람 많은 무리 출력.
df: 데이터프레임
d: 파일 이름"""
age_groups = pd.cut(
Expand Down Expand Up @@ -60,10 +60,14 @@ def cluster(df, year, n_clusters=5, method="kmeans"):
n_clusters: 그룹 수
"""
os.makedirs(os.path.join(outdir, method), exist_ok=True)
youngest_age = ("", 100)
oldest_age = ("", 0)
os.makedirs(os.path.join(outdir, "all-clusters", method), exist_ok=True)
youngest_age = ('', 100)
oldest_age = ('', 0)
print(f"({year}), {n_clusters} clusters")
print(f"{'-' * 20}")
# Get a colormap for generating unique colors for clusters
colors = cm.rainbow(np.linspace(0, 1, n_clusters))

# 데이터프레임에서 시도별로 묶은 후 나이 열만 가져옵니다.
for sdName, df in df.groupby("sdName"):
data_for_clustering = df[["age"]]
Expand Down Expand Up @@ -136,41 +140,47 @@ def cluster(df, year, n_clusters=5, method="kmeans"):
oldest_age = (sdName, cluster_centers_age[-1])
# 시각화
plt.figure(figsize=(10, 6))
try:
sns.histplot(
data=youngest_cluster,
x="age",
kde=True,
label="Youngest Cluster",
color="blue",
element="step",
bins=range(
youngest_cluster["age"].min(), youngest_cluster["age"].max() + 1, 1
),
)
except:
pass
try:
sns.histplot(
data=oldest_cluster,
x="age",
kde=True,
label="Oldest Cluster",
color="red",
element="step",
bins=range(
oldest_cluster["age"].min(), oldest_cluster["age"].max() + 1, 1
),
)
except:
pass
plt.title(f"Age Distribution ({year})")
# try:
# sns.histplot(data=youngest_cluster,
# x="age",
# kde=True,
# label="Youngest Cluster",
# color="blue",
# element="step",
# bins=range(youngest_cluster['age'].min(),
# youngest_cluster['age'].max() + 1, 1))
# except:
# pass
# try:
# sns.histplot(data=oldest_cluster,
# x="age",
# kde=True,
# label="Oldest Cluster",
# color="red",
# element="step",
# bins=range(oldest_cluster['age'].min(),
# oldest_cluster['age'].max() + 1, 1))
# except:
# pass
for i in range(n_clusters):
cluster_data = df[df["cluster_label"] == i]
sns.histplot(data=cluster_data,
x="age",
kde=False,
label=f"Cluster {i}",
color=colors[i],
element="step",
bins=range(0,
100, 1))

plt.title(f"Age Distribution ({year}), area {sdName}", fontname = font_name)
plt.xlabel("Age")
plt.ylabel("Frequency")
plt.legend()
plt.show()
plt.savefig(os.path.join(outdir, method, f"{year}-{sdName}.png"))

plt.savefig(os.path.join(outdir, "all-clusters", method, f"{year}-{sdName}.png"))
plt.close()
print(f"Saved {year}-{sdName}.png")
# # 클러스터링 결과로부터 각 데이터 포인트와 클러스터 중심 간의 거리를 계산합니다.
# distances = pairwise_distances(data_for_clustering, cluster_centers)

Expand Down Expand Up @@ -230,8 +240,7 @@ def main():
year = d[7:11]
# most_common_age_group(df, year)
# avg_age(df)
cluster(df, year, 7, "kmeans")
# cluster(df, year, 7, 'equal')

cluster(df, year, 7, 'kmeans')
cluster(df, year, 7, 'equal')

# Calls main() unconditionally at module level (there is no
# `if __name__ == "__main__":` guard), so importing this module also runs it.
main()

0 comments on commit efdf757

Please sign in to comment.