diff --git a/analysis/age_group.py b/analysis/age_group.py index 39ba053..f9f96de 100644 --- a/analysis/age_group.py +++ b/analysis/age_group.py @@ -3,30 +3,30 @@ import os import warnings import seaborn as sns -import matplotlib -import matplotlib.pyplot as plt -import matplotlib.font_manager as fm import numpy as np from sklearn.cluster import KMeans from sklearn.metrics import pairwise_distances - -# font_location = '/home/gun/playground/techforimpact/main/_data/NanumGothic.TTF' -# font_name = fm.FontProperties(fname=font_location).get_name() -# plt.rcParams["font.family"] = font_name +from matplotlib import font_manager +from matplotlib import cm +from matplotlib import pyplot as plt # 경고 무시 warnings.filterwarnings("ignore", category=FutureWarning) BASE_DIR = os.path.join(os.path.dirname(__file__), os.pardir) -# outdir = input("_data 내의 폴더 이름은 무엇인가요?") -outdir = input() +outdir = input("_data 내의 폴더 이름은 무엇인가요?") datadir = os.path.join(BASE_DIR, "_data", outdir) -outdir = os.path.join(BASE_DIR, "output", outdir) - - +outdir = os.path.join(BASE_DIR, "output", "tmp", outdir) + +# matplotlib 한국어 폰트 설정 +font_name = font_manager.FontProperties(fname=os.path.join(BASE_DIR, "_data", "NanumSquareL.ttf")).get_name() +# import matplotlib +# print(matplotlib.matplotlib_fname()) +# print(matplotlib.get_cachedir()) +# exit(1) def most_common_age_group(df, d): - """가장 흔한 10년단위그룹이 뭔지 봅니다. + """10년단위로 무리짓고 가장 사람 많은 무리 출력. df: 데이터프레임 d: 파일 이름""" age_groups = pd.cut( @@ -60,10 +60,14 @@ def cluster(df, year, n_clusters=5, method="kmeans"): n_clusters: 그룹 수 """ os.makedirs(os.path.join(outdir, method), exist_ok=True) - youngest_age = ("", 100) - oldest_age = ("", 0) + os.makedirs(os.path.join(outdir, "all-clusters", method), exist_ok=True) + youngest_age = ('', 100) + oldest_age = ('', 0) print(f"({year}), {n_clusters} clusters") print(f"{'-' * 20}") + # Get a colormap for generating unique colors for clusters + colors = cm.rainbow(np.linspace(0, 1, n_clusters)) + # 데이터프레임에서 시도별로 묶은 후 나이 열만 가져옵니다. for sdName, df in df.groupby("sdName"): data_for_clustering = df[["age"]] @@ -136,41 +140,47 @@ def cluster(df, year, n_clusters=5, method="kmeans"): oldest_age = (sdName, cluster_centers_age[-1]) # 시각화 plt.figure(figsize=(10, 6)) - try: - sns.histplot( - data=youngest_cluster, - x="age", - kde=True, - label="Youngest Cluster", - color="blue", - element="step", - bins=range( - youngest_cluster["age"].min(), youngest_cluster["age"].max() + 1, 1 - ), - ) - except: - pass - try: - sns.histplot( - data=oldest_cluster, - x="age", - kde=True, - label="Oldest Cluster", - color="red", - element="step", - bins=range( - oldest_cluster["age"].min(), oldest_cluster["age"].max() + 1, 1 - ), - ) - except: - pass - plt.title(f"Age Distribution ({year})") + # try: + # sns.histplot(data=youngest_cluster, + # x="age", + # kde=True, + # label="Youngest Cluster", + # color="blue", + # element="step", + # bins=range(youngest_cluster['age'].min(), + # youngest_cluster['age'].max() + 1, 1)) + # except: + # pass + # try: + # sns.histplot(data=oldest_cluster, + # x="age", + # kde=True, + # label="Oldest Cluster", + # color="red", + # element="step", + # bins=range(oldest_cluster['age'].min(), + # oldest_cluster['age'].max() + 1, 1)) + # except: + # pass + for i in range(n_clusters): + cluster_data = df[df["cluster_label"] == i] + sns.histplot(data=cluster_data, + x="age", + kde=False, + label=f"Cluster {i}", + color=colors[i], + element="step", + bins=range(0, + 100, 1)) + + plt.title(f"Age Distribution ({year}), area {sdName}", fontname = font_name) plt.xlabel("Age") plt.ylabel("Frequency") plt.legend() plt.show() - plt.savefig(os.path.join(outdir, method, f"{year}-{sdName}.png")) - + plt.savefig(os.path.join(outdir, "all-clusters", method, f"{year}-{sdName}.png")) + plt.close() + print(f"Saved {year}-{sdName}.png") # # 클러스터링 결과로부터 각 데이터 포인트와 클러스터 중심 간의 거리를 계산합니다. # distances = pairwise_distances(data_for_clustering, cluster_centers) @@ -230,8 +240,7 @@ def main(): year = d[7:11] # most_common_age_group(df, year) # avg_age(df) - cluster(df, year, 7, "kmeans") - # cluster(df, year, 7, 'equal') - + cluster(df, year, 7, 'kmeans') + cluster(df, year, 7, 'equal') main()