학습목표¶

  • 시각화 패키지 matplotlib, seaborn, folium
  • 서브 패키지 pyplot, plotly
  • 웹 시각화 streamlit
In [1]:
import numpy  as np 
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt 
import json   

# warning 제거
import warnings
warnings.filterwarnings('ignore')

# version check 
print('numpy  version - ' , np.__version__)
print('pandas version - ' , pd.__version__)

# 데이터 정보 출력 함수 
def aryInfo(ary) : 
    print('type - ' , type(ary)) 
    print('shape - ' , ary.shape)
    print('ndim  - ' , ary.ndim)
    print('dtype - ' , ary.dtype)
    print()
    print('data  -')
    print(ary)

def seriesInfo(s) :
    print('type   - ' , type(s)) 
    print('index  - ' , s.index)
    print('values - ' , s.values)
    print('dtype  - ' , s.dtype)
    print()
    print('data   - ')
    print(s)

def frmInfo(frm) :
    print('type    - ' , type(frm))
    print('shape   - ' , frm.shape)
    print('ndim    - ' , frm.ndim)
    print('row idx - ' , frm.index , type(frm.index))
    print('col idx - ' , frm.columns , type(frm.columns))
    print('values  - ' , type(frm.values))
    print(frm.values)
    print('data - ') 
    print(frm)
numpy  version -  2.1.3
pandas version -  2.2.3
In [2]:
%matplotlib inline

# 한글 폰트 문제 해결
import platform
from matplotlib import font_manager, rc
plt.rcParams['axes.unicode_minus'] = False

if platform.system() == 'Darwin':
    rc('font', family='AppleGothic')
elif platform.system() == 'Windows':
    path = "c:/Windows/Fonts/malgun.ttf"
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc('font', family=font_name)
else:
    print('Unknown system... sorry~~~~') 


# 차트 축 <- 음수 부호 지원
import matplotlib
matplotlib.rcParams['axes.unicode_minus'] = False
In [11]:
plt.figure()

# plt.plot([1,2,3,4,5,6,7,8,9])
# plt.plot([1,4,9,5,6,7,2,7,9])

plt.plot([10, 30, 60, 90], [1,4,9,16], color = 'red', marker ='o', ms = 15)

plt.title('라인 플롯 - ')
plt.xlabel('x 축')
plt.ylabel('y 축', rotation = 45)

plt.xlim(0, 100)
plt.ylim(0,17)

plt.grid()
plt.show()
plt.close()
No description has been provided for this image
In [19]:
# 서브플롯을 이용해서 한 화면에 여러개의 플롯을 그릴 수 있다.

fig = plt.figure(figsize = (20,7))

area01 = fig.add_subplot(1, 3, 1)
area01.set_title('타이틀')
area01.set_xlabel('x 축')
area01.set_ylabel('y 축', rotation=0)

area02 = fig.add_subplot(1, 3, 2)
area02.set_title('타이틀')
area02.set_xlabel('x 축')
area02.set_ylabel('y 축', rotation=0)

area03 = fig.add_subplot(1, 3, 3)
area03.set_title('타이틀')
area03.set_xlabel('x 축')
area03.set_ylabel('y 축', rotation=0)

plt.show()
plt.close()
No description has been provided for this image
In [20]:
print('bar char : x 축이 범주형타입(category)')
titanicFrm = sns.load_dataset('titanic')
titanicFrm.info()
bar char : x 축이 범주형타입(category)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB
In [33]:
# Quiz
# 선실등급별 생존자 합을 시각화 한다면?
# titanicFrm.groupby('pclass')['survived'].groups
# titanicFrm.groupby('pclass')['survived'].sum().index
titanicFrm.groupby('pclass')['survived'].sum().values
Out[33]:
array([136,  87, 119])
In [41]:
plt.figure(figsize = (15,5))

plt.bar(titanicFrm.groupby('pclass')['survived'].sum().index, 
        titanicFrm.groupby('pclass')['survived'].sum().values)

plt.xticks(titanicFrm.groupby('pclass')['survived'].sum().index)

plt.title('선실 등급별 생존자 - ')
plt.xlabel('선실등급')
plt.ylabel('선실 등급별 생존자', rotation = 45)

plt.show()
plt.close()
No description has been provided for this image
In [42]:
# 간단한 시각화를 위해서 더미 데이터 세트를 만들어 보자
# 로그인 로그 데이터(timestamp, user, ip, status, delay_ms)
In [50]:
# timestamp : pd.date_range('2025-11-06', periods=100, freq='H')
# user : np.random.choice(['admin', 'superAdmin', 'root', 'guest', 'analyst'], 100)
# ip : np.random.choice(['192.168.0.1', '192.168.0.3', '192.168.0.5', '192.168.0.7', '192.168.0.9'], 100)
# status : np.random.choice(['success', 'fail'], 100, p=[0.6, 0.4])
# delay_ms : np.random.randint(20, 800, 100)
In [186]:
frm = pd.DataFrame({
    "timestamp" : pd.date_range('2025-11-06', periods=100, freq='H'),
    "user" : np.random.choice(['admin', 'superAdmin', 'root', 'guest', 'analyst'], 100),
    "ip" : np.random.choice(['192.168.0.1', '192.168.0.3', '192.168.0.5', '192.168.0.7', '192.168.0.9'], 100),
    "status" : np.random.choice(['success', 'fail'], 100, p=[0.6, 0.4]),
    "delay_ms" : np.random.randint(20, 800, 100)
})
frm.head()
Out[186]:
timestamp user ip status delay_ms
0 2025-11-06 00:00:00 root 192.168.0.3 fail 257
1 2025-11-06 01:00:00 admin 192.168.0.7 fail 688
2 2025-11-06 02:00:00 analyst 192.168.0.3 success 559
3 2025-11-06 03:00:00 analyst 192.168.0.3 success 688
4 2025-11-06 04:00:00 guest 192.168.0.3 fail 459
In [74]:
# Quiz
# 로그인 시도 상태별 횟수를 bar plot 이용하여 시각화

plt.figure(figsize = (15,5))

plt.bar(frm['status'].value_counts().index, 
        frm['status'].value_counts().values,
        color = ['green', 'red'])

plt.xticks(frm['status'].value_counts().index)

plt.title('로그인 시도 상태 - ')
plt.xlabel('상태')
plt.ylabel('시도횟수', rotation = 45)

plt.show()
plt.close()

# frm.groupby('status').count()
# frm['status'].value_counts()
No description has been provided for this image
In [81]:
# Quiz
# 시간대별 평균 지연시간을 line plot 시각화 -
# type(frm['timestamp'])
hour = frm['timestamp'].dt.hour

plt.figure(figsize=(15,5))

plt.plot(frm.groupby(frm['timestamp'].dt.hour)['delay_ms'].mean().index,
          frm.groupby(frm['timestamp'].dt.hour)['delay_ms'].mean().values)

plt.xticks(frm.groupby(frm['timestamp'].dt.hour)['delay_ms'].mean().index)

plt.title('시간대별 평균 지연시간')
plt.xlabel('시간대')
plt.ylabel('지연시간',rotation = 0)

plt.show()
plt.close()
No description has been provided for this image
In [82]:
irisFrm = sns.load_dataset('iris')
irisFrm.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB
In [83]:
irisFrm.head()
Out[83]:
sepal_length sepal_width petal_length petal_width species
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa
In [100]:
# Quiz
# 품종을 기준으로 그룹화 후 막대그래프로 시각화
speciesFrm = irisFrm.groupby('species').mean()

plt.figure()

#speciesFrm.plot(kind='bar')
speciesFrm.T.plot(kind='bar')

plt.legend(loc = 'best')
plt.xticks(rotation = 0)
plt.show()
plt.close()
<Figure size 640x480 with 0 Axes>
No description has been provided for this image
In [104]:
# histogram
# 연속형 데이터의 분포(distribution), 일정한 구간(bin)
# 해당 구간에 포함되는 데이터의 개수를 세어서 막대형태로 표현
# Quiz 로그인 지연 분포 확인


plt.figure(figsize=(15,5))

plt.hist(frm['delay_ms'], bins=20)

plt.title('로그인 지연 분포')
plt.xlabel('delay(ms)')
plt.ylabel('Freq',rotation = 0)

plt.show()
plt.close()
No description has been provided for this image
In [108]:
# countplot
# 사용자별 로그인 시도 패턴

plt.figure(figsize = (15,5))

sns.countplot(x = 'user',hue = 'status', data = frm, palette = 'coolwarm')

plt.show()
plt.close()
No description has been provided for this image
In [147]:
# box plot : 이상치(outlier) 탐지를 위한 시각화 도구
# 데이터의 중심(median), 퍼짐(사분위수), 이상치(outlier)를 한눈에 보여줌
# Q1(25%), Q2(50%), Q3(75%)
# IQR(Inner Quartile Range) : Q3 - Q1
# lower bound = Q1 - 1.5 * IQR , upper bound : Q3 + 1.5 * IQR
# 판정기준 값 < lower bound : 하한 이상치 , 값 > upper bound : 상한 이상치
# whisker(수염) : IQR 1.5배 범위 내 데이터
In [173]:
boxFrm = pd.DataFrame({
    
    "user" : np.random.choice(['admin', 'root', 'guest'], 100),
    "delay_ms" : np.concatenate([
                    np.random.normal(200, 50, 80),
                    np.random.normal(800, 20, 10),
                    np.random.normal(100, 20, 10)
                ])
})
boxFrm.head()
Out[173]:
user delay_ms
0 root 160.264908
1 root 210.451747
2 root 134.348377
3 guest 185.193624
4 guest 255.948266
In [155]:
# 정규분포 더미 데이터
# np.random.normal(200, 50, 80)
# np.random.normal(800, 20, 10)
# np.random.normal(100, 20, 10)

np.concatenate([
    np.random.normal(200, 50, 80),
    np.random.normal(800, 20, 10),
    np.random.normal(100, 20, 10)
])
Out[155]:
array([292.51640763, 168.57420747, 290.69907913, 244.46286972,
       177.34253952, 217.46538652, 129.25760432, 205.74352218,
       248.47891827, 268.54624142, 173.97877908, 210.92703789,
       141.63760471, 160.8589941 , 199.30024495, 267.19403124,
       135.90760527, 242.63479587, 209.26242116, 263.14823025,
       200.09712326, 176.7684967 ,  99.91410234, 271.96073546,
       284.21562685,  93.86541221, 149.01866832, 204.04893058,
        84.11781713, 144.5350304 , 181.48100725, 236.14604557,
       129.17758147, 150.0605812 , 184.9505203 , 238.19389845,
       206.05383098, 188.15616798, 198.81813538, 178.78378535,
       254.04604323, 175.23122605, 221.80939747, 285.99845825,
       279.03247656, 272.61639641, 144.92982197, 219.8355202 ,
       125.3305636 , 183.84997572, 226.30474703, 224.62590732,
       193.49746878, 258.78868177, 152.21134589, 173.43409163,
       189.21475311, 190.51262847, 298.0115558 , 220.34321991,
       178.0259904 , 238.45438115, 234.10971496,  84.28255466,
       156.5164555 , 185.31665669, 139.70191482, 169.02536862,
       125.87790124, 226.99320902, 245.44084644, 197.65665821,
       224.25296061, 197.56926419, 304.57799871, 272.67420392,
       295.74316342, 230.25838562, 268.21286954, 177.63697886,
       795.61537162, 837.26440743, 783.02674952, 764.35211196,
       797.14154998, 800.56453445, 795.74691275, 797.2859874 ,
       791.94023827, 765.83006055,  99.58661875, 129.57059035,
       125.54431818,  53.75850928, 105.25341529,  88.87525094,
       146.77941659,  98.99170167,  72.98629229,  89.4745997 ])
In [174]:
boxFrm.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   user      100 non-null    object 
 1   delay_ms  100 non-null    float64
dtypes: float64(1), object(1)
memory usage: 1.7+ KB
In [175]:
boxFrm['delay_ms'].describe()
Out[175]:
count    100.000000
mean     256.293914
std      196.997663
min       52.234947
25%      159.336635
50%      201.314419
75%      252.315077
max      864.892806
Name: delay_ms, dtype: float64
In [176]:
# IQR
Q1 = boxFrm['delay_ms'].quantile(0.25)
print('Q1 - ', Q1)
Q3 = boxFrm['delay_ms'].quantile(0.75)
print('Q3 - ', Q3)
IQR = Q3 - Q1
print('IQR - ', IQR)

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
print(f'lower {lower_bound} , upper {upper_bound}')
Q1 -  159.33663457474654
Q3 -  252.31507680778475
IQR -  92.97844223303821
lower 19.868971225189227 , upper 391.78274015734206
In [177]:
print('이상치 탐지 - ')
outliers = boxFrm[ (boxFrm['delay_ms'] < lower_bound) | (boxFrm['delay_ms'] > upper_bound) ]
print(outliers)
이상치 탐지 - 
     user    delay_ms
80  admin  864.892806
81   root  821.593476
82  admin  839.315056
83   root  789.554091
84  admin  804.741641
85  admin  777.361475
86  admin  823.628389
87  admin  830.470144
88  guest  790.787516
89  admin  828.703043
In [178]:
# box plot : 이상치(outlier) 탐지를 위한 시각화 도구
# 로그인 지연시간 이상치 탐지

plt.figure(figsize = (15, 5))

sns.boxplot(x = 'delay_ms', data = boxFrm, color = 'gray')
sns.stripplot(x = 'delay_ms', data = boxFrm, color = 'red', jitter = True, alpha = 0.5)

plt.show()
plt.close()
No description has been provided for this image
In [179]:
# 산점도(scatter plot)
# 두 개의 연속형 변수 간의 관계를 시각화
# x : 독립변수(feature), y : 종속변수(target)
# 퍼짐의 정도
# 점들이 어떤 패턴(선형, 곡선, 군집)을 이루는지 보면서 변수간의 관계를 파악하기 위한 시각화
In [185]:
plt.figure()

x = [1,2,3,4,5,6,7,8,9]
y = [1,4,9,5,6,7,2,7,9]

plt.scatter(x,y, color = 'red', s = 5, alpha = 0.7, marker = 'o')

plt.grid(False)
plt.show()
plt.close()
No description has been provided for this image
In [189]:
# Quiz
# 사용자별 로그인 시도 패턴을 산점도로 시각화하고 싶다 (시도횟수)
# 각 점은 : 사용자
# x : 평균 로그인 지연시간
# y : 실패율(failRatio)

# insight : 비정상적인 사용자 행동 패턴을 탐지할 수 있다.
Out[189]:
<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001CBA2687890>
In [199]:
scatterFrm = pd.DataFrame({
    "timestamp" : pd.date_range('2025-11-06', periods=100, freq='H'),
    "user" : np.random.choice(['admin', 'superAdmin', 'root', 'guest', 'analyst'], 100),
    "ip" : np.random.choice(['192.168.0.1', '192.168.0.3', '192.168.0.5', '192.168.0.7', '192.168.0.9'], 100),
    "status" : np.random.choice(['success', 'fail'], 100, p=[0.6, 0.4]),
    "delay_ms" : np.random.randint(20, 800, 100)
})
scatterFrm.head()
Out[199]:
timestamp user ip status delay_ms
0 2025-11-06 00:00:00 guest 192.168.0.9 fail 726
1 2025-11-06 01:00:00 superAdmin 192.168.0.3 success 798
2 2025-11-06 02:00:00 admin 192.168.0.3 success 585
3 2025-11-06 03:00:00 analyst 192.168.0.5 success 97
4 2025-11-06 04:00:00 admin 192.168.0.1 success 756
In [220]:
# plt.figure()
# x = scatterFrm.groupby('user')['delay_ms'].mean()
# y = scatterFrm.groupby('user')['status'].apply(lambda x : x.value_counts()['fail']/x.value_counts().sum())
# plt.scatter(x, y, color='red', s=5, marker='o')
# plt.show()
# plt.close()

avg = scatterFrm.groupby('user')['delay_ms'].mean()
# print(x)
failRatio = scatterFrm.groupby('user')['status'].apply(lambda x : (x=='fail').mean())
# print(y)
attempts = scatterFrm['user'].value_counts()
# print(scatterFrm['user'].value_counts())
userStatus = pd.DataFrame({
    'avg' : avg,
    'failRatio' : failRatio,
    'attempts' : attempts
});

plt.figure(figsize = (15,5))

sns.scatterplot(x='avg',
                y='failRatio',
                data = userStatus,
                size = 'attempts',
                hue='user')

plt.show()
plt.close()
No description has been provided for this image
In [232]:
# heatmap
corr = irisFrm.corr(numeric_only=True)
print(corr)

plt.figure(figsize=(15,5))

sns.heatmap(corr, fmt='.2f', annot=True, linewidth=0.5)
plt.show()
plt.close()
              sepal_length  sepal_width  petal_length  petal_width
sepal_length      1.000000    -0.117570      0.871754     0.817941
sepal_width      -0.117570     1.000000     -0.428440    -0.366126
petal_length      0.871754    -0.428440      1.000000     0.962865
petal_width       0.817941    -0.366126      0.962865     1.000000
No description has been provided for this image
In [242]:
# frm 데이터를 이용해서 히트맵 시각화
# Quiz
# 사용자-상태별 평균 지연시간

pivot = frm.pivot_table(index='user', columns='status', values='delay_ms', aggfunc='mean')
# print(pivot)

plt.figure(figsize=(15,5))

sns.heatmap(pivot, fmt='.2f', annot=True, linewidth=0.5)
plt.show()
plt.close()
No description has been provided for this image
In [243]:
# Quiz
In [246]:
mpgFrm = pd.read_excel('./data/mpg_visualization.xlsx',
                      index_col = 0)
mpgFrm.head()
Out[246]:
manufacturer model displ year cyl trans drv cty hwy fl class
1 audi a4 1.8 1999 4 auto(l5) f 18 29 p compact
2 audi a4 1.8 1999 4 manual(m5) f 21 29 p compact
3 audi a4 2.0 2008 4 manual(m6) f 20 31 p compact
4 audi a4 2.0 2008 4 auto(av) f 21 30 p compact
5 audi a4 2.8 1999 6 auto(l5) f 16 26 p compact
In [261]:
# print('Q1) 배기량(displ)에 따른 고속연비를 확인하고 한다')
# print('배기량 4 이하인 자동차와 5이상인 자동차 중 고속도로 평균연비가 높은지를 확인한다면')

avg = mpgFrm.groupby(mpgFrm['displ'] >= 5)['hwy'].mean()
#print(avg)

plt.figure(figsize=(15,5))

avg.index = ['5미만', '5이상']

plt.bar(avg.index,
           avg.values)

plt.show()
plt.close()
No description has been provided for this image
In [265]:
# print('Q2) 자동차 제조사에 따른 도시 연비를 비교할려고 한다')
# print('audi , toyota 두 회사의 모든 차종에 대한 도시연비 평균을 비교 - ') 


audi = mpgFrm[mpgFrm['manufacturer'] == 'audi']['cty'].mean()
toyota = mpgFrm[mpgFrm['manufacturer'] == 'toyota']['cty'].mean()

avg = pd.Series([audi, toyota], index=['audi', 'toyota'])

plt.figure(figsize=(15,5))
plt.bar(avg.index, avg.values)

plt.show()
plt.close()
No description has been provided for this image
In [266]:
# print('Q3) chevrolet, ford, honda 제조사의 모든 차종에 대한 고속도로 연비 평균을 시각화')

chevrolet = mpgFrm[mpgFrm['manufacturer'] == 'chevrolet']['hwy'].mean()
ford = mpgFrm[mpgFrm['manufacturer'] == 'ford']['hwy'].mean()
honda = mpgFrm[mpgFrm['manufacturer'] == 'honda']['hwy'].mean()

avg = pd.Series([chevrolet, ford, honda], index=['chevrolet', 'ford', 'honda'])

plt.figure(figsize=(15,5))
plt.bar(avg.index, avg.values)

plt.show()
plt.close()
No description has been provided for this image
In [272]:
# print('Q4)구동방식별 고속도로연비평균을 막대 그래프로 시각화 - ')


trans = mpgFrm.groupby('trans')['hwy'].mean()
print(trans)

plt.figure(figsize=(15,5))
plt.bar(trans.index, trans.values)

plt.show()
plt.close()
trans
auto(av)      27.800000
auto(l3)      27.000000
auto(l4)      21.963855
auto(l5)      20.717949
auto(l6)      20.000000
auto(s4)      25.666667
auto(s5)      25.333333
auto(s6)      25.187500
manual(m5)    26.293103
manual(m6)    24.210526
Name: hwy, dtype: float64
No description has been provided for this image
In [276]:
# print('Q5) 구동방식별 고속도로, 도시연비 평균을 서브셋을 만들고')
# print('시각화 - multi bar ') 

avg = mpgFrm.groupby('drv')[['cty', 'hwy']].mean()
print(avg)

avg.plot(kind='bar', figsize=(15,5))

plt.show()
plt.close()
           cty        hwy
drv                      
4    14.330097  19.174757
f    19.971698  28.160377
r    14.080000  21.000000
No description has been provided for this image
In [280]:
# print('Q6) 해당 클래스별 빈도수를 시각화 - ')

class_count = mpgFrm['class'].value_counts()
print(class_count)

plt.figure(figsize=(8,5))
class_count.plot(kind='bar')
plt.title('자동차 클래스별 빈도수')
plt.xlabel('차종 (class)')
plt.ylabel('빈도수 (count)')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()
plt.close()
class
suv           62
compact       47
midsize       41
subcompact    35
pickup        33
minivan       11
2seater        5
Name: count, dtype: int64
No description has been provided for this image
In [ ]: