import numpy  as np 
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt 
import json   

# Suppress warnings
import warnings
# NOTE(review): with no category/module argument this filter ignores every
# warning, including deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')

# Report the versions of the two core libraries
for label, module in (('numpy  version - ', np), ('pandas version - ', pd)):
    print(label, module.__version__)

# 데이터 정보 출력 함수 
def aryInfo(ary) :
    """Print a short report about an array-like object.

    Writes the object's type, then its ``shape``, ``ndim`` and ``dtype``
    attributes (one per line), a blank line, and finally the object itself.
    Nothing is returned.
    """
    print('type - ', type(ary))
    # Attributes are read one at a time so output appears in the same order
    # as the lookups happen.
    for label, attr in (('shape - ', 'shape'), ('ndim  - ', 'ndim'), ('dtype - ', 'dtype')):
        print(label, getattr(ary, attr))
    print()
    print('data  -')
    print(ary)

def seriesInfo(s) :
    """Print a short report about a Series-like object.

    Writes the object's type, then its ``index``, ``values`` and ``dtype``
    attributes (one per line), a blank line, and finally the object itself.
    Nothing is returned.
    """
    print('type   - ', type(s))
    for label, attr in (('index  - ', 'index'), ('values - ', 'values'), ('dtype  - ', 'dtype')):
        print(label, getattr(s, attr))
    print()
    print('data   - ')
    print(s)

def frmInfo(frm) :
    """Print a short report about a DataFrame-like object.

    Writes the object's type, ``shape`` and ``ndim``; the row index and the
    column index, each followed by its type; the type of ``values`` and the
    values themselves; and finally the object itself. Nothing is returned.
    """
    print('type    - ', type(frm))
    for label, attr in (('shape   - ', 'shape'), ('ndim    - ', 'ndim')):
        print(label, getattr(frm, attr))
    # Both axes are reported the same way: the index object, then its type.
    for label, attr in (('row idx - ', 'index'), ('col idx - ', 'columns')):
        axis = getattr(frm, attr)
        print(label, axis, type(axis))
    raw = frm.values
    print('values  - ', type(raw))
    print(raw)
    print('data - ')
    print(frm)

numpy  version -  2.1.3
pandas version -  2.2.3

%matplotlib inline

# Korean font setup so that Hangul titles and labels render in the charts
import platform
from pathlib import Path
from matplotlib import font_manager, rc
# Render the minus sign on the axes as a plain hyphen.
plt.rcParams['axes.unicode_minus'] = False

if platform.system() == 'Darwin':
    rc('font', family='AppleGothic')
elif platform.system() == 'Windows' and Path("c:/Windows/Fonts/malgun.ttf").is_file():
    # Only read the font file when it exists; otherwise fall through to the
    # generic lookup below instead of failing on a missing file.
    path = "c:/Windows/Fonts/malgun.ttf"
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc('font', family=font_name)
else:
    # Other platforms (or Windows without malgun.ttf): use the first Korean
    # font family that matplotlib has registered, if there is one.
    installed_fonts = {entry.name for entry in font_manager.fontManager.ttflist}
    korean_fonts = [
        name
        for name in ('Malgun Gothic', 'NanumGothic', 'NanumBarunGothic',
                     'Noto Sans CJK KR', 'Noto Sans KR', 'UnDotum')
        if name in installed_fonts
    ]
    if korean_fonts:
        rc('font', family=korean_fonts[0])
    else:
        print('Unknown system... sorry~~~~')


# Chart axes <- support for the minus sign
import matplotlib
# NOTE(review): the same key is already set through plt.rcParams in the font
# setup above; this looks redundant (pyplot presumably exposes the same
# rcParams object) - confirm before removing.
matplotlib.rcParams['axes.unicode_minus'] = False

# Basic line plot, drawn through an explicit Axes object
fig, ax = plt.subplots()

# plt.plot([1,2,3,4,5,6,7,8,9])
# plt.plot([1,4,9,5,6,7,2,7,9])

ax.plot([10, 30, 60, 90], [1, 4, 9, 16], color='red', marker='o', ms=15)

ax.set_title('라인 플롯 - ')
ax.set_xlabel('x 축')
ax.set_ylabel('y 축', rotation=45)

# Fix the visible range of both axes
ax.set_xlim(0, 100)
ax.set_ylim(0, 17)

ax.grid()
plt.show()
plt.close()

# Subplots make it possible to draw several plots on a single canvas.

fig = plt.figure(figsize = (20,7))

# Three panels side by side in a 1x3 grid, all labelled the same way.
area01, area02, area03 = (fig.add_subplot(1, 3, position) for position in (1, 2, 3))

for panel in (area01, area02, area03):
    panel.set_title('타이틀')
    panel.set_xlabel('x 축')
    panel.set_ylabel('y 축', rotation=0)

plt.show()
plt.close()

print('bar char : x 축이 범주형타입(category)')
# Load seaborn's 'titanic' example dataset
titanicFrm = sns.load_dataset('titanic')
# Print the column / non-null count / dtype summary
titanicFrm.info()

bar char : x 축이 범주형타입(category)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB

# Quiz
# How would you visualise the number of survivors per cabin class?
# titanicFrm.groupby('pclass')['survived'].groups
# titanicFrm.groupby('pclass')['survived'].sum().index
# Sum of `survived` within each `pclass` group, shown as a plain array
titanicFrm.groupby('pclass')['survived'].sum().values

array([136,  87, 119])

# Survivors per cabin class: aggregate once and reuse the result for the bar
# positions, the bar heights and the tick positions.
survivedByClass = titanicFrm.groupby('pclass')['survived'].sum()

plt.figure(figsize = (15,5))

plt.bar(survivedByClass.index,
        survivedByClass.values)

# Put a tick only on the class values that actually occur
plt.xticks(survivedByClass.index)

plt.title('선실 등급별 생존자 - ')
plt.xlabel('선실등급')
plt.ylabel('선실 등급별 생존자', rotation = 45)

plt.show()
plt.close()

# Build a dummy data set for some simple visualisations:
# login log records (timestamp, user, ip, status, delay_ms)

# timestamp : 100 consecutive hourly timestamps starting 2025-11-06
# user      : random pick from five account names
# ip        : random pick from five addresses
# status    : 'success' with probability 0.6, 'fail' with probability 0.4
# delay_ms  : random integer in [20, 800)

frm = pd.DataFrame({
    # Lower-case 'h' is the hourly alias; the upper-case 'H' spelling is
    # deprecated in pandas 2.2.
    "timestamp" : pd.date_range('2025-11-06', periods=100, freq='h'),
    "user" : np.random.choice(['admin', 'superAdmin', 'root', 'guest', 'analyst'], 100),
    "ip" : np.random.choice(['192.168.0.1', '192.168.0.3', '192.168.0.5', '192.168.0.7', '192.168.0.9'], 100),
    "status" : np.random.choice(['success', 'fail'], 100, p=[0.6, 0.4]),
    "delay_ms" : np.random.randint(20, 800, 100)
})
frm.head()

# Quiz
# Bar plot of the number of login attempts per status

# Count once and reuse the result for the bars, the colours and the ticks.
statusCounts = frm['status'].value_counts()

# value_counts() orders its result by frequency, so the position of each
# status is not fixed; choose the colour by label rather than by position.
statusColors = {'success': 'green', 'fail': 'red'}

plt.figure(figsize = (15,5))

plt.bar(statusCounts.index,
        statusCounts.values,
        color = [statusColors.get(label, 'gray') for label in statusCounts.index])

plt.xticks(statusCounts.index)

plt.title('로그인 시도 상태 - ')
plt.xlabel('상태')
plt.ylabel('시도횟수', rotation = 45)

plt.show()
plt.close()

# frm.groupby('status').count()
# frm['status'].value_counts()

# Quiz
# Line plot of the mean delay per hour of day
# type(frm['timestamp'])
hour = frm['timestamp'].dt.hour

# Aggregate once, grouping by the hour extracted above, and reuse the result
# for the line and for the tick positions.
delayByHour = frm.groupby(hour)['delay_ms'].mean()

plt.figure(figsize=(15,5))

plt.plot(delayByHour.index,
         delayByHour.values)

plt.xticks(delayByHour.index)

plt.title('시간대별 평균 지연시간')
plt.xlabel('시간대')
plt.ylabel('지연시간',rotation = 0)

plt.show()
plt.close()

# Load seaborn's 'iris' example dataset
irisFrm = sns.load_dataset('iris')
# Print the column / non-null count / dtype summary
irisFrm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB

irisFrm.head()

# Quiz
# Group by species, then draw the group means as a bar chart
speciesFrm = irisFrm.groupby('species').mean()

# No plt.figure() here: DataFrame.plot() opens its own figure, so a figure
# created beforehand stays empty and is not the one plt.close() closes.
#speciesFrm.plot(kind='bar')
# Transposed so the measurements are on the x axis and each species is a bar series
speciesFrm.T.plot(kind='bar')

plt.legend(loc = 'best')
plt.xticks(rotation = 0)
plt.show()
plt.close()

<Figure size 640x480 with 0 Axes>

# histogram
# Shows the distribution of continuous data: the value range is cut into
# fixed intervals (bins) and the number of observations in each bin is drawn
# as a bar.
# Quiz: inspect the distribution of the login delay

fig, ax = plt.subplots(figsize=(15, 5))

ax.hist(frm['delay_ms'], bins=20)

ax.set_title('로그인 지연 분포')
ax.set_xlabel('delay(ms)')
ax.set_ylabel('Freq', rotation=0)

plt.show()
plt.close()

# countplot
# Login-attempt pattern per user: one bar per (user, status) pair

fig, ax = plt.subplots(figsize=(15, 5))

sns.countplot(data=frm, x='user', hue='status', palette='coolwarm', ax=ax)

plt.show()
plt.close()

# box plot : a visual tool for spotting outliers
# Shows the centre (median), the spread (quartiles) and the outliers at a glance
# Q1(25%), Q2(50%), Q3(75%)
# IQR (Interquartile Range) : Q3 - Q1
# lower bound = Q1 - 1.5 * IQR , upper bound = Q3 + 1.5 * IQR
# Rule: value < lower bound -> low outlier , value > upper bound -> high outlier
# whisker : data lying within 1.5 * IQR of the box

# (mean, std, count) of each normally distributed chunk of delays:
# the bulk of the data, a high cluster and a low cluster.
delaySpecs = ((200, 50, 80), (800, 20, 10), (100, 20, 10))

# Users are drawn first, then the delay chunks in the order listed above.
boxUsers = np.random.choice(['admin', 'root', 'guest'], 100)
boxDelays = np.concatenate(
    [np.random.normal(mean, std, count) for mean, std, count in delaySpecs]
)
boxFrm = pd.DataFrame({"user" : boxUsers, "delay_ms" : boxDelays})
boxFrm.head()

# Normally distributed dummy data
# np.random.normal(200, 50, 80)
# np.random.normal(800, 20, 10)
# np.random.normal(100, 20, 10)

# A fresh draw of the same three chunks, shown on its own
np.concatenate(
    [np.random.normal(mean, std, count) for mean, std, count in delaySpecs]
)

array([292.51640763, 168.57420747, 290.69907913, 244.46286972,
       177.34253952, 217.46538652, 129.25760432, 205.74352218,
       248.47891827, 268.54624142, 173.97877908, 210.92703789,
       141.63760471, 160.8589941 , 199.30024495, 267.19403124,
       135.90760527, 242.63479587, 209.26242116, 263.14823025,
       200.09712326, 176.7684967 ,  99.91410234, 271.96073546,
       284.21562685,  93.86541221, 149.01866832, 204.04893058,
        84.11781713, 144.5350304 , 181.48100725, 236.14604557,
       129.17758147, 150.0605812 , 184.9505203 , 238.19389845,
       206.05383098, 188.15616798, 198.81813538, 178.78378535,
       254.04604323, 175.23122605, 221.80939747, 285.99845825,
       279.03247656, 272.61639641, 144.92982197, 219.8355202 ,
       125.3305636 , 183.84997572, 226.30474703, 224.62590732,
       193.49746878, 258.78868177, 152.21134589, 173.43409163,
       189.21475311, 190.51262847, 298.0115558 , 220.34321991,
       178.0259904 , 238.45438115, 234.10971496,  84.28255466,
       156.5164555 , 185.31665669, 139.70191482, 169.02536862,
       125.87790124, 226.99320902, 245.44084644, 197.65665821,
       224.25296061, 197.56926419, 304.57799871, 272.67420392,
       295.74316342, 230.25838562, 268.21286954, 177.63697886,
       795.61537162, 837.26440743, 783.02674952, 764.35211196,
       797.14154998, 800.56453445, 795.74691275, 797.2859874 ,
       791.94023827, 765.83006055,  99.58661875, 129.57059035,
       125.54431818,  53.75850928, 105.25341529,  88.87525094,
       146.77941659,  98.99170167,  72.98629229,  89.4745997 ])

# Print the column / non-null count / dtype summary of the box-plot data
boxFrm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   user      100 non-null    object 
 1   delay_ms  100 non-null    float64
dtypes: float64(1), object(1)
memory usage: 1.7+ KB

# Summary statistics of the delay column (count, mean, std, min, quartiles, max)
boxFrm['delay_ms'].describe()

count    100.000000
mean     256.293914
std      196.997663
min       52.234947
25%      159.336635
50%      201.314419
75%      252.315077
max      864.892806
Name: delay_ms, dtype: float64

# IQR-based outlier fences for the login delay
delayValues = boxFrm['delay_ms']

Q1 = delayValues.quantile(0.25)
print('Q1 - ', Q1)
Q3 = delayValues.quantile(0.75)
print('Q3 - ', Q3)
IQR = Q3 - Q1
print('IQR - ', IQR)

# Values further than 1.5 * IQR outside the quartiles count as outliers
fenceWidth = 1.5 * IQR
lower_bound, upper_bound = Q1 - fenceWidth, Q3 + fenceWidth
print(f'lower {lower_bound} , upper {upper_bound}')

Q1 -  159.33663457474654
Q3 -  252.31507680778475
IQR -  92.97844223303821
lower 19.868971225189227 , upper 391.78274015734206

print('이상치 탐지 - ')
# A row is an outlier when its delay falls strictly outside either fence
belowFence = boxFrm['delay_ms'].lt(lower_bound)
aboveFence = boxFrm['delay_ms'].gt(upper_bound)
outliers = boxFrm.loc[belowFence | aboveFence]
print(outliers)

이상치 탐지 - 
     user    delay_ms
80  admin  864.892806
81   root  821.593476
82  admin  839.315056
83   root  789.554091
84  admin  804.741641
85  admin  777.361475
86  admin  823.628389
87  admin  830.470144
88  guest  790.787516
89  admin  828.703043

# box plot : a visual tool for spotting outliers
# Outlier detection on the login delay

fig, ax = plt.subplots(figsize=(15, 5))

# Box for the summary, individual observations drawn on top of it
sns.boxplot(data=boxFrm, x='delay_ms', color='gray', ax=ax)
sns.stripplot(data=boxFrm, x='delay_ms', color='red', jitter=True, alpha=0.5, ax=ax)

plt.show()
plt.close()

# Scatter plot
# Visualises the relationship between two continuous variables
# x : independent variable (feature), y : dependent variable (target)
# Shows how widely the points are spread
# The pattern the points form (linear, curved, clustered) hints at how the
# variables are related

x = list(range(1, 10))
y = [1,4,9,5,6,7,2,7,9]

fig, ax = plt.subplots()
ax.scatter(x, y, color='red', s=5, alpha=0.7, marker='o')

ax.grid(False)
plt.show()
plt.close()

# Quiz
# 사용자별 로그인 시도 패턴을 산점도로 시각화하고 싶다 (시도횟수)
# 각 점은 : 사용자
# x : 평균 로그인 지연시간
# y : 실패율(failRatio)

# insight : 비정상적인 사용자 행동 패턴을 탐지할 수 있다.

<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001CBA2687890>

# Dummy login log (timestamp, user, ip, status, delay_ms) for the scatter-plot quiz
scatterFrm = pd.DataFrame({
    # Lower-case 'h' is the hourly alias; the upper-case 'H' spelling is
    # deprecated in pandas 2.2.
    "timestamp" : pd.date_range('2025-11-06', periods=100, freq='h'),
    "user" : np.random.choice(['admin', 'superAdmin', 'root', 'guest', 'analyst'], 100),
    "ip" : np.random.choice(['192.168.0.1', '192.168.0.3', '192.168.0.5', '192.168.0.7', '192.168.0.9'], 100),
    "status" : np.random.choice(['success', 'fail'], 100, p=[0.6, 0.4]),
    "delay_ms" : np.random.randint(20, 800, 100)
})
scatterFrm.head()

# plt.figure()
# x = scatterFrm.groupby('user')['delay_ms'].mean()
# y = scatterFrm.groupby('user')['status'].apply(lambda x : x.value_counts()['fail']/x.value_counts().sum())
# plt.scatter(x, y, color='red', s=5, marker='o')
# plt.show()
# plt.close()

# Per-user summary. All three measures come from the same groupby, so they
# share one index that is named 'user'.
byUser = scatterFrm.groupby('user')
avg = byUser['delay_ms'].mean()                                     # mean delay
failRatio = byUser['status'].apply(lambda s : (s == 'fail').mean())   # share of failed attempts
attempts = byUser.size()                                            # number of attempts
userStatus = pd.DataFrame({
    'avg' : avg,
    'failRatio' : failRatio,
    'attempts' : attempts
})

plt.figure(figsize = (15,5))

# One point per user. 'user' is the index of userStatus, so turn it into a
# regular column for the plot instead of relying on an index-name lookup.
sns.scatterplot(x='avg',
                y='failRatio',
                data = userStatus.reset_index(),
                size = 'attempts',
                hue='user')

plt.show()
plt.close()

# heatmap
# Pairwise correlation of the numeric iris columns
corr = irisFrm.corr(numeric_only=True)
print(corr)

fig, ax = plt.subplots(figsize=(15, 5))

# Write each coefficient into its cell, rounded to two decimals
sns.heatmap(corr, annot=True, fmt='.2f', linewidth=0.5, ax=ax)
plt.show()
plt.close()

              sepal_length  sepal_width  petal_length  petal_width
sepal_length      1.000000    -0.117570      0.871754     0.817941
sepal_width      -0.117570     1.000000     -0.428440    -0.366126
petal_length      0.871754    -0.428440      1.000000     0.962865
petal_width       0.817941    -0.366126      0.962865     1.000000

# Heatmap built from the frm data
# Quiz
# Mean delay per (user, status) pair

pivot = frm.pivot_table(values='delay_ms', index='user', columns='status', aggfunc='mean')
# print(pivot)

fig, ax = plt.subplots(figsize=(15, 5))

sns.heatmap(pivot, annot=True, fmt='.2f', linewidth=0.5, ax=ax)
plt.show()
plt.close()

# Quiz

# Read the mpg workbook (path is relative to the working directory);
# the first column of the sheet is used as the row index.
mpgFrm = pd.read_excel('./data/mpg_visualization.xlsx',
                      index_col = 0)
mpgFrm.head()

# Q1) Compare highway mileage (hwy) by engine displacement (displ):
# do cars with displ <= 4 or cars with displ >= 5 have the higher mean?

# The two groups are selected explicitly so that cars with 4 < displ < 5,
# which belong to neither group in the question, are left out.
hwySmallDispl = mpgFrm.loc[mpgFrm['displ'] <= 4, 'hwy'].mean()
hwyLargeDispl = mpgFrm.loc[mpgFrm['displ'] >= 5, 'hwy'].mean()

avg = pd.Series([hwySmallDispl, hwyLargeDispl], index=['4이하', '5이상'])
#print(avg)

plt.figure(figsize=(15,5))

plt.bar(avg.index,
           avg.values)

plt.show()
plt.close()

# Q2) Compare city mileage (cty) between manufacturers:
# mean cty over every model of audi versus every model of toyota

audi, toyota = (
    mpgFrm.loc[mpgFrm['manufacturer'] == maker, 'cty'].mean()
    for maker in ('audi', 'toyota')
)

avg = pd.Series({'audi': audi, 'toyota': toyota})

fig, ax = plt.subplots(figsize=(15, 5))
ax.bar(avg.index, avg.values)

plt.show()
plt.close()

# Q3) Mean highway mileage (hwy) over every model of chevrolet, ford and honda

hwyMakers = ['chevrolet', 'ford', 'honda']
chevrolet, ford, honda = (
    mpgFrm.loc[mpgFrm['manufacturer'] == maker, 'hwy'].mean()
    for maker in hwyMakers
)

avg = pd.Series([chevrolet, ford, honda], index=hwyMakers)

fig, ax = plt.subplots(figsize=(15, 5))
ax.bar(avg.index, avg.values)

plt.show()
plt.close()

# Q4) Bar chart of the mean highway mileage (hwy) per drive type

# The drive type is the 'drv' column; 'trans' holds the transmission type
# (auto/manual variants), so the grouping key for this question is 'drv'.
drvHwy = mpgFrm.groupby('drv')['hwy'].mean()
print(drvHwy)

plt.figure(figsize=(15,5))
plt.bar(drvHwy.index, drvHwy.values)

plt.show()
plt.close()

trans
auto(av)      27.800000
auto(l3)      27.000000
auto(l4)      21.963855
auto(l5)      20.717949
auto(l6)      20.000000
auto(s4)      25.666667
auto(s5)      25.333333
auto(s6)      25.187500
manual(m5)    26.293103
manual(m6)    24.210526
Name: hwy, dtype: float64

# Q5) Build a subset with the mean city and highway mileage per drive type
# and draw it as a grouped (multi) bar chart

mileageColumns = ['cty', 'hwy']
avg = mpgFrm.groupby('drv')[mileageColumns].mean()
print(avg)

# One bar group per drive type, one bar per mileage column
avg.plot.bar(figsize=(15, 5))

plt.show()
plt.close()

           cty        hwy
drv                      
4    14.330097  19.174757
f    19.971698  28.160377
r    14.080000  21.000000

# Q6) Visualise how often each vehicle class occurs

class_count = mpgFrm['class'].value_counts()
print(class_count)

fig, ax = plt.subplots(figsize=(8, 5))
class_count.plot(kind='bar', ax=ax)
ax.set_title('자동차 클래스별 빈도수')
ax.set_xlabel('차종 (class)')
ax.set_ylabel('빈도수 (count)')
# Light dashed horizontal guides only
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()
plt.close()

class
suv           62
compact       47
midsize       41
subcompact    35
pickup        33
minivan       11
2seater        5
Name: count, dtype: int64

	timestamp	user	ip	status	delay_ms
0	2025-11-06 00:00:00	root	192.168.0.3	fail	257
1	2025-11-06 01:00:00	admin	192.168.0.7	fail	688
2	2025-11-06 02:00:00	analyst	192.168.0.3	success	559
3	2025-11-06 03:00:00	analyst	192.168.0.3	success	688
4	2025-11-06 04:00:00	guest	192.168.0.3	fail	459

	sepal_length	sepal_width	petal_length	petal_width	species
0	5.1	3.5	1.4	0.2	setosa
1	4.9	3.0	1.4	0.2	setosa
2	4.7	3.2	1.3	0.2	setosa
3	4.6	3.1	1.5	0.2	setosa
4	5.0	3.6	1.4	0.2	setosa

	user	delay_ms
0	root	160.264908
1	root	210.451747
2	root	134.348377
3	guest	185.193624
4	guest	255.948266

	timestamp	user	ip	status	delay_ms
0	2025-11-06 00:00:00	guest	192.168.0.9	fail	726
1	2025-11-06 01:00:00	superAdmin	192.168.0.3	success	798
2	2025-11-06 02:00:00	admin	192.168.0.3	success	585
3	2025-11-06 03:00:00	analyst	192.168.0.5	success	97
4	2025-11-06 04:00:00	admin	192.168.0.1	success	756

	manufacturer	model	displ	year	cyl	trans	drv	cty	hwy	fl	class
1	audi	a4	1.8	1999	4	auto(l5)	f	18	29	p	compact
2	audi	a4	1.8	1999	4	manual(m5)	f	21	29	p	compact
3	audi	a4	2.0	2008	4	manual(m6)	f	20	31	p	compact
4	audi	a4	2.0	2008	4	auto(av)	f	21	30	p	compact
5	audi	a4	2.8	1999	6	auto(l5)	f	16	26	p	compact

학습목표¶