학습목표
- 시각화 패키지 matplotlib, seaborn, folium
- 서브 패키지 pyplot, plotly
- 웹 시각화 streamlit
In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import json
# Silence library warnings so cell output stays readable.
import warnings
warnings.filterwarnings('ignore')

# Version check. The doubled space reproduces print()'s default separator
# after the trailing blank of the label.
print(f'numpy version -  {np.__version__}')
print(f'pandas version -  {pd.__version__}')
# 데이터 정보 출력 함수
def aryInfo(ary):
    """Print a diagnostic summary of an array to stdout.

    Emits the object's Python type, ``shape``, ``ndim`` and ``dtype``,
    then a blank line, a ``data -`` header and the array itself.

    Args:
        ary: Object exposing ``shape``, ``ndim`` and ``dtype`` attributes
            (for example a ``numpy.ndarray``).

    Returns:
        None. The summary is written with ``print``.
    """
    print('type - ' , type(ary))
    print('shape - ' , ary.shape)
    print('ndim - ' , ary.ndim)
    print('dtype - ' , ary.dtype)
    print()
    print('data -')
    print(ary)
def seriesInfo(s):
    """Print a diagnostic summary of a Series to stdout.

    Emits the object's Python type, ``index``, ``values`` and ``dtype``,
    then a blank line, a ``data - `` header and the Series itself.

    Args:
        s: Object exposing ``index``, ``values`` and ``dtype`` attributes
            (for example a ``pandas.Series``).

    Returns:
        None. The summary is written with ``print``.
    """
    print('type - ' , type(s))
    print('index - ' , s.index)
    print('values - ' , s.values)
    print('dtype - ' , s.dtype)
    print()
    print('data - ')
    print(s)
def frmInfo(frm):
    """Print a diagnostic summary of a DataFrame to stdout.

    Emits the object's Python type, ``shape``, ``ndim``, the row and column
    indexes (each followed by its type), the type of ``values`` and the
    ``values`` array, then a ``data - `` header and the frame itself.

    Args:
        frm: Object exposing ``shape``, ``ndim``, ``index``, ``columns`` and
            ``values`` attributes (for example a ``pandas.DataFrame``).

    Returns:
        None. The summary is written with ``print``.
    """
    print('type - ' , type(frm))
    print('shape - ' , frm.shape)
    print('ndim - ' , frm.ndim)
    print('row idx - ' , frm.index , type(frm.index))
    print('col idx - ' , frm.columns , type(frm.columns))
    print('values - ' , type(frm.values))
    print(frm.values)
    print('data - ')
    print(frm)
numpy version - 2.1.3 pandas version - 2.2.3
In [2]:
%matplotlib inline
# 한글 폰트 문제 해결
import platform
from matplotlib import font_manager, rc
plt.rcParams['axes.unicode_minus'] = False
if platform.system() == 'Darwin':
rc('font', family='AppleGothic')
elif platform.system() == 'Windows':
path = "c:/Windows/Fonts/malgun.ttf"
font_name = font_manager.FontProperties(fname=path).get_name()
rc('font', family=font_name)
else:
print('Unknown system... sorry~~~~')
# 차트 축 <- 음수 부호 지원
import matplotlib
matplotlib.rcParams['axes.unicode_minus'] = False
In [11]:
# Basic line plot from explicit x and y coordinates, drawn in red with
# large circular markers. (Passing a single list to plt.plot would treat
# it as the y values instead.)
plt.figure()
plt.plot(
    [10, 30, 60, 90],
    [1, 4, 9, 16],
    color='red',
    marker='o',
    markersize=15,
)

# Title and axis labels.
plt.title('라인 플롯 - ')
plt.xlabel('x 축')
plt.ylabel('y 축', rotation=45)

# Fixed axis ranges with a background grid.
plt.xlim((0, 100))
plt.ylim((0, 17))
plt.grid()

plt.show()
plt.close()
In [19]:
# Subplots let several plots share one figure: here, one row of three axes.
fig = plt.figure(figsize=(20, 7))
area01, area02, area03 = (fig.add_subplot(1, 3, slot) for slot in (1, 2, 3))

# Every panel gets the same title and axis labels.
for panel in (area01, area02, area03):
    panel.set_title('타이틀')
    panel.set_xlabel('x 축')
    panel.set_ylabel('y 축', rotation=0)

plt.show()
plt.close()
In [20]:
print('bar char : x 축이 범주형타입(category)')
titanicFrm = sns.load_dataset('titanic')
titanicFrm.info()
bar char : x 축이 범주형타입(category) <class 'pandas.core.frame.DataFrame'> RangeIndex: 891 entries, 0 to 890 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 survived 891 non-null int64 1 pclass 891 non-null int64 2 sex 891 non-null object 3 age 714 non-null float64 4 sibsp 891 non-null int64 5 parch 891 non-null int64 6 fare 891 non-null float64 7 embarked 889 non-null object 8 class 891 non-null category 9 who 891 non-null object 10 adult_male 891 non-null bool 11 deck 203 non-null category 12 embark_town 889 non-null object 13 alive 891 non-null object 14 alone 891 non-null bool dtypes: bool(2), category(2), float64(2), int64(4), object(5) memory usage: 80.7+ KB
In [33]:
# Quiz
# 선실등급별 생존자 합을 시각화 한다면?
# titanicFrm.groupby('pclass')['survived'].groups
# titanicFrm.groupby('pclass')['survived'].sum().index
titanicFrm.groupby('pclass')['survived'].sum().values
Out[33]:
array([136, 87, 119])
In [41]:
# Bar chart of the number of survivors per passenger class.
# Aggregate once and reuse the result for bar positions, bar heights and
# tick locations instead of repeating the same groupby three times.
survivedByClass = titanicFrm.groupby('pclass')['survived'].sum()

plt.figure(figsize = (15,5))
plt.bar(survivedByClass.index, survivedByClass.values)
plt.xticks(survivedByClass.index)  # one tick per class value
plt.title('선실 등급별 생존자 - ')
plt.xlabel('선실등급')
plt.ylabel('선실 등급별 생존자', rotation = 45)
plt.show()
plt.close()
In [42]:
# 간단한 시각화를 위해서 더미 데이터 세트를 만들어 보자
# 로그인 로그 데이터(timestamp, user, ip, status, delay_ms)
In [50]:
# timestamp : pd.date_range('2025-11-06', periods=100, freq='H')
# user : np.random.choice(['admin', 'superAdmin', 'root', 'guest', 'analyst'], 100)
# ip : np.random.choice(['192.168.0.1', '192.168.0.3', '192.168.0.5', '192.168.0.7', '192.168.0.9'], 100)
# status : np.random.choice(['success', 'fail'], 100, p=[0.6, 0.4])
# delay_ms : np.random.randint(20, 800, 100)
In [186]:
# Dummy login-log data set: 100 hourly records with a random user, source
# IP, outcome (60% success / 40% fail) and delay in [20, 800) ms.
# The hourly frequency is spelled 'h': the upper-case 'H' alias is
# deprecated as of pandas 2.2.
frm = pd.DataFrame({
    "timestamp" : pd.date_range('2025-11-06', periods=100, freq='h'),
    "user"      : np.random.choice(['admin', 'superAdmin', 'root', 'guest', 'analyst'], 100),
    "ip"        : np.random.choice(['192.168.0.1', '192.168.0.3', '192.168.0.5', '192.168.0.7', '192.168.0.9'], 100),
    "status"    : np.random.choice(['success', 'fail'], 100, p=[0.6, 0.4]),
    "delay_ms"  : np.random.randint(20, 800, 100)
})
frm.head()
Out[186]:
| timestamp | user | ip | status | delay_ms | |
|---|---|---|---|---|---|
| 0 | 2025-11-06 00:00:00 | root | 192.168.0.3 | fail | 257 |
| 1 | 2025-11-06 01:00:00 | admin | 192.168.0.7 | fail | 688 |
| 2 | 2025-11-06 02:00:00 | analyst | 192.168.0.3 | success | 559 |
| 3 | 2025-11-06 03:00:00 | analyst | 192.168.0.3 | success | 688 |
| 4 | 2025-11-06 04:00:00 | guest | 192.168.0.3 | fail | 459 |
In [74]:
# Quiz: bar chart of the number of login attempts per status.
statusCounts = frm['status'].value_counts()

# value_counts() orders labels by frequency, so a positional color list
# would paint 'fail' green whenever failures outnumber successes.
# Choose each bar's color from its label instead.
statusColors = {'success': 'green', 'fail': 'red'}
barColors = [statusColors.get(label, 'gray') for label in statusCounts.index]

plt.figure(figsize = (15,5))
plt.bar(statusCounts.index, statusCounts.values, color = barColors)
plt.xticks(statusCounts.index)
plt.title('로그인 시도 상태 - ')
plt.xlabel('상태')
plt.ylabel('시도횟수', rotation = 45)
plt.show()
plt.close()
# frm.groupby('status').count()
# frm['status'].value_counts()
In [81]:
# Quiz: line plot of the mean login delay per hour of day.
# Extract the hour once and aggregate once; the result supplies the x
# values, the y values and the tick positions.
hour = frm['timestamp'].dt.hour
delayByHour = frm.groupby(hour)['delay_ms'].mean()

plt.figure(figsize=(15,5))
plt.plot(delayByHour.index, delayByHour.values)
plt.xticks(delayByHour.index)
plt.title('시간대별 평균 지연시간')
plt.xlabel('시간대')
plt.ylabel('지연시간',rotation = 0)
plt.show()
plt.close()
In [82]:
irisFrm = sns.load_dataset('iris')
irisFrm.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 150 entries, 0 to 149 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 sepal_length 150 non-null float64 1 sepal_width 150 non-null float64 2 petal_length 150 non-null float64 3 petal_width 150 non-null float64 4 species 150 non-null object dtypes: float64(4), object(1) memory usage: 6.0+ KB
In [83]:
irisFrm.head()
Out[83]:
| sepal_length | sepal_width | petal_length | petal_width | species | |
|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
In [100]:
# Quiz: group by species and show the per-species feature means as bars.
speciesFrm = irisFrm.groupby('species').mean()

# Transposed so the x axis lists the features and each species is one bar
# series (speciesFrm.plot(kind='bar') would put species on the x axis).
# DataFrame.plot opens its own figure, so no plt.figure() call precedes
# it; that call only produced an extra empty figure in the output.
speciesFrm.T.plot(kind='bar')
plt.legend(loc = 'best')
plt.xticks(rotation = 0)
plt.show()
plt.close()
<Figure size 640x480 with 0 Axes>
In [104]:
# Histogram: shows the distribution of a continuous variable by splitting
# its range into fixed intervals (bins) and drawing one bar per bin whose
# height is the number of observations that fall inside it.
# Quiz: inspect the distribution of login delays.
delayValues = frm['delay_ms']

plt.figure(figsize=(15, 5))
plt.hist(delayValues, bins=20)
plt.title('로그인 지연 분포')
plt.xlabel('delay(ms)')
plt.ylabel('Freq', rotation=0)
plt.show()
plt.close()
In [108]:
# Count plot: login-attempt pattern per user, with one bar per status.
plt.figure(figsize=(15, 5))
sns.countplot(
    data=frm,
    x='user',
    hue='status',
    palette='coolwarm',
)
plt.show()
plt.close()
In [147]:
# box plot : 이상치(outlier) 탐지를 위한 시각화 도구
# 데이터의 중심(median), 퍼짐(사분위수), 이상치(outlier)를 한눈에 보여줌
# Q1(25%), Q2(50%), Q3(75%)
# IQR(Interquartile Range) : Q3 - Q1
# lower bound = Q1 - 1.5 * IQR , upper bound : Q3 + 1.5 * IQR
# 판정기준 값 < lower bound : 하한 이상치 , 값 > upper bound : 상한 이상치
# whisker(수염) : IQR 1.5배 범위 내 데이터
In [173]:
# Dummy data for outlier detection: 100 random users paired with delays
# drawn from three normal distributions, given as (mean, std, count).
boxUsers = np.random.choice(['admin', 'root', 'guest'], 100)
boxDelays = np.concatenate([
    np.random.normal(center, spread, count)
    for center, spread, count in ((200, 50, 80), (800, 20, 10), (100, 20, 10))
])
boxFrm = pd.DataFrame({"user": boxUsers, "delay_ms": boxDelays})
boxFrm.head()
Out[173]:
| user | delay_ms | |
|---|---|---|
| 0 | root | 160.264908 |
| 1 | root | 210.451747 |
| 2 | root | 134.348377 |
| 3 | guest | 185.193624 |
| 4 | guest | 255.948266 |
In [155]:
# 정규분포 더미 데이터
# np.random.normal(200, 50, 80)
# np.random.normal(800, 20, 10)
# np.random.normal(100, 20, 10)
np.concatenate([
np.random.normal(200, 50, 80),
np.random.normal(800, 20, 10),
np.random.normal(100, 20, 10)
])
Out[155]:
array([292.51640763, 168.57420747, 290.69907913, 244.46286972,
177.34253952, 217.46538652, 129.25760432, 205.74352218,
248.47891827, 268.54624142, 173.97877908, 210.92703789,
141.63760471, 160.8589941 , 199.30024495, 267.19403124,
135.90760527, 242.63479587, 209.26242116, 263.14823025,
200.09712326, 176.7684967 , 99.91410234, 271.96073546,
284.21562685, 93.86541221, 149.01866832, 204.04893058,
84.11781713, 144.5350304 , 181.48100725, 236.14604557,
129.17758147, 150.0605812 , 184.9505203 , 238.19389845,
206.05383098, 188.15616798, 198.81813538, 178.78378535,
254.04604323, 175.23122605, 221.80939747, 285.99845825,
279.03247656, 272.61639641, 144.92982197, 219.8355202 ,
125.3305636 , 183.84997572, 226.30474703, 224.62590732,
193.49746878, 258.78868177, 152.21134589, 173.43409163,
189.21475311, 190.51262847, 298.0115558 , 220.34321991,
178.0259904 , 238.45438115, 234.10971496, 84.28255466,
156.5164555 , 185.31665669, 139.70191482, 169.02536862,
125.87790124, 226.99320902, 245.44084644, 197.65665821,
224.25296061, 197.56926419, 304.57799871, 272.67420392,
295.74316342, 230.25838562, 268.21286954, 177.63697886,
795.61537162, 837.26440743, 783.02674952, 764.35211196,
797.14154998, 800.56453445, 795.74691275, 797.2859874 ,
791.94023827, 765.83006055, 99.58661875, 129.57059035,
125.54431818, 53.75850928, 105.25341529, 88.87525094,
146.77941659, 98.99170167, 72.98629229, 89.4745997 ])
In [174]:
boxFrm.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 100 entries, 0 to 99 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 user 100 non-null object 1 delay_ms 100 non-null float64 dtypes: float64(1), object(1) memory usage: 1.7+ KB
In [175]:
boxFrm['delay_ms'].describe()
Out[175]:
count 100.000000 mean 256.293914 std 196.997663 min 52.234947 25% 159.336635 50% 201.314419 75% 252.315077 max 864.892806 Name: delay_ms, dtype: float64
In [176]:
# IQR-based outlier fences for the delay column.
delayColumn = boxFrm['delay_ms']

Q1 = delayColumn.quantile(0.25)
print('Q1 - ', Q1)
Q3 = delayColumn.quantile(0.75)
print('Q3 - ', Q3)

# Spread of the middle 50% of the data.
IQR = Q3 - Q1
print('IQR - ', IQR)

# Values beyond 1.5 * IQR from the quartiles count as outliers.
fence = 1.5 * IQR
lower_bound = Q1 - fence
upper_bound = Q3 + fence
print(f'lower {lower_bound} , upper {upper_bound}')
Q1 - 159.33663457474654 Q3 - 252.31507680778475 IQR - 92.97844223303821 lower 19.868971225189227 , upper 391.78274015734206
In [177]:
# Outlier detection: keep the rows whose delay lies outside the fences.
print('이상치 탐지 - ')
tooLow = boxFrm['delay_ms'] < lower_bound
tooHigh = boxFrm['delay_ms'] > upper_bound
outliers = boxFrm[tooLow | tooHigh]
print(outliers)
이상치 탐지 -
user delay_ms
80 admin 864.892806
81 root 821.593476
82 admin 839.315056
83 root 789.554091
84 admin 804.741641
85 admin 777.361475
86 admin 823.628389
87 admin 830.470144
88 guest 790.787516
89 admin 828.703043
In [178]:
# Box plot: a visual tool for spotting outliers.
# Here: outliers in login delay, with the individual observations
# overlaid as semi-transparent jittered red points.
plt.figure(figsize=(15, 5))
sns.boxplot(data=boxFrm, x='delay_ms', color='gray')
sns.stripplot(
    data=boxFrm,
    x='delay_ms',
    color='red',
    jitter=True,
    alpha=0.5,
)
plt.show()
plt.close()
In [179]:
# 산점도(scatter plot)
# 두 개의 연속형 변수 간의 관계를 시각화
# x : 독립변수(feature), y : 종속변수(target)
# 퍼짐의 정도
# 점들이 어떤 패턴(선형, 곡선, 군집)을 이루는지 보면서 변수간의 관계를 파악하기 위한 시각화
In [185]:
# Minimal scatter plot: small, slightly transparent red circles, no grid.
x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
y = [1, 4, 9, 5, 6, 7, 2, 7, 9]

plt.figure()
plt.scatter(
    x,
    y,
    s=5,
    color='red',
    marker='o',
    alpha=0.7,
)
plt.grid(False)
plt.show()
plt.close()
In [189]:
# Quiz
# 사용자별 로그인 시도 패턴을 산점도로 시각화하고 싶다 (시도횟수)
# 각 점은 : 사용자
# x : 평균 로그인 지연시간
# y : 실패율(failRatio)
# insight : 비정상적인 사용자 행동 패턴을 탐지할 수 있다.
Out[189]:
<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001CBA2687890>
In [199]:
# Fresh dummy login log for the scatter-plot quiz: 100 hourly records with
# a random user, source IP, outcome (60% success / 40% fail) and delay in
# [20, 800) ms. The hourly frequency is spelled 'h': the upper-case 'H'
# alias is deprecated as of pandas 2.2.
scatterFrm = pd.DataFrame({
    "timestamp" : pd.date_range('2025-11-06', periods=100, freq='h'),
    "user"      : np.random.choice(['admin', 'superAdmin', 'root', 'guest', 'analyst'], 100),
    "ip"        : np.random.choice(['192.168.0.1', '192.168.0.3', '192.168.0.5', '192.168.0.7', '192.168.0.9'], 100),
    "status"    : np.random.choice(['success', 'fail'], 100, p=[0.6, 0.4]),
    "delay_ms"  : np.random.randint(20, 800, 100)
})
scatterFrm.head()
Out[199]:
| timestamp | user | ip | status | delay_ms | |
|---|---|---|---|---|---|
| 0 | 2025-11-06 00:00:00 | guest | 192.168.0.9 | fail | 726 |
| 1 | 2025-11-06 01:00:00 | superAdmin | 192.168.0.3 | success | 798 |
| 2 | 2025-11-06 02:00:00 | admin | 192.168.0.3 | success | 585 |
| 3 | 2025-11-06 03:00:00 | analyst | 192.168.0.5 | success | 97 |
| 4 | 2025-11-06 04:00:00 | admin | 192.168.0.1 | success | 756 |
In [220]:
# Per-user login pattern as a scatter plot: one point per user, with
# x = mean login delay, y = share of failed attempts, and point size =
# number of attempts.
avg = scatterFrm.groupby('user')['delay_ms'].mean()

# Mean of a boolean mask is the fraction of True values, i.e. the fail ratio.
failRatio = scatterFrm['status'].eq('fail').groupby(scatterFrm['user']).mean()

attempts = scatterFrm['user'].value_counts()

# The three Series are aligned on the user label, which becomes the index.
userStatus = pd.DataFrame({
    'avg' : avg,
    'failRatio' : failRatio,
    'attempts' : attempts
})

plt.figure(figsize = (15,5))
# hue='user' must name a column. Expose the index as an explicit 'user'
# column for plotting rather than relying on the index name being resolved.
sns.scatterplot(x='avg',
                y='failRatio',
                data = userStatus.rename_axis('user').reset_index(),
                size = 'attempts',
                hue='user')
plt.show()
plt.close()
In [232]:
# Heatmap of the pairwise correlations between the numeric iris columns.
corr = irisFrm.corr(numeric_only=True)
print(corr)

plt.figure(figsize=(15,5))
# `linewidths` is heatmap's own parameter for the cell borders; the
# previous `linewidth` spelling only worked as a pass-through keyword.
sns.heatmap(corr, fmt='.2f', annot=True, linewidths=0.5)
plt.show()
plt.close()
sepal_length sepal_width petal_length petal_width sepal_length 1.000000 -0.117570 0.871754 0.817941 sepal_width -0.117570 1.000000 -0.428440 -0.366126 petal_length 0.871754 -0.428440 1.000000 0.962865 petal_width 0.817941 -0.366126 0.962865 1.000000
In [242]:
# Quiz: heatmap built from the `frm` login log, showing the mean delay
# for every (user, status) combination.
pivot = frm.pivot_table(index='user', columns='status', values='delay_ms', aggfunc='mean')

plt.figure(figsize=(15,5))
# `linewidths` is heatmap's own parameter for the cell borders; the
# previous `linewidth` spelling only worked as a pass-through keyword.
sns.heatmap(pivot, fmt='.2f', annot=True, linewidths=0.5)
plt.show()
plt.close()
In [243]:
# Quiz
In [246]:
mpgFrm = pd.read_excel('./data/mpg_visualization.xlsx',
index_col = 0)
mpgFrm.head()
Out[246]:
| manufacturer | model | displ | year | cyl | trans | drv | cty | hwy | fl | class | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | audi | a4 | 1.8 | 1999 | 4 | auto(l5) | f | 18 | 29 | p | compact |
| 2 | audi | a4 | 1.8 | 1999 | 4 | manual(m5) | f | 21 | 29 | p | compact |
| 3 | audi | a4 | 2.0 | 2008 | 4 | manual(m6) | f | 20 | 31 | p | compact |
| 4 | audi | a4 | 2.0 | 2008 | 4 | auto(av) | f | 21 | 30 | p | compact |
| 5 | audi | a4 | 2.8 | 1999 | 6 | auto(l5) | f | 16 | 26 | p | compact |
In [261]:
# Q1) Compare highway mileage by engine displacement (displ): do cars with
# displacement of 4 or less, or cars with displacement of 5 or more, have
# the higher mean highway mileage?
# The two groups are selected explicitly. Splitting on `displ >= 5` alone
# would put cars with 4 < displ < 5 into the low group, which the question
# excludes, and positional relabeling would depend on both groups existing.
avg = pd.Series(
    [
        mpgFrm.loc[mpgFrm['displ'] <= 4, 'hwy'].mean(),
        mpgFrm.loc[mpgFrm['displ'] >= 5, 'hwy'].mean(),
    ],
    index=['4이하', '5이상'],
)

plt.figure(figsize=(15,5))
plt.bar(avg.index,
        avg.values)
plt.show()
plt.close()
In [265]:
# Q2) Compare city mileage between manufacturers: mean city mileage
# across all models of audi versus toyota.
audi = mpgFrm.loc[mpgFrm['manufacturer'] == 'audi', 'cty'].mean()
toyota = mpgFrm.loc[mpgFrm['manufacturer'] == 'toyota', 'cty'].mean()
avg = pd.Series({'audi': audi, 'toyota': toyota})

plt.figure(figsize=(15, 5))
plt.bar(avg.index, avg.values)
plt.show()
plt.close()
In [266]:
# Q3) Mean highway mileage across all models of chevrolet, ford and honda.
chevrolet = mpgFrm.loc[mpgFrm['manufacturer'] == 'chevrolet', 'hwy'].mean()
ford = mpgFrm.loc[mpgFrm['manufacturer'] == 'ford', 'hwy'].mean()
honda = mpgFrm.loc[mpgFrm['manufacturer'] == 'honda', 'hwy'].mean()
avg = pd.Series({'chevrolet': chevrolet, 'ford': ford, 'honda': honda})

plt.figure(figsize=(15, 5))
plt.bar(avg.index, avg.values)
plt.show()
plt.close()
In [272]:
# Q4) Mean highway mileage per group, as a bar chart.
# NOTE(review): the original prompt asks for a breakdown by drive type,
# which Q5 computes from the `drv` column, but this cell groups by `trans`.
# Confirm which column was intended.
trans = mpgFrm['hwy'].groupby(mpgFrm['trans']).mean()
print(trans)

plt.figure(figsize=(15, 5))
plt.bar(trans.index, trans.values)
plt.show()
plt.close()
trans auto(av) 27.800000 auto(l3) 27.000000 auto(l4) 21.963855 auto(l5) 20.717949 auto(l6) 20.000000 auto(s4) 25.666667 auto(s5) 25.333333 auto(s6) 25.187500 manual(m5) 26.293103 manual(m6) 24.210526 Name: hwy, dtype: float64
In [276]:
# Q5) Build a subset holding the mean city and highway mileage per drive
# type, then draw it as a grouped (multi) bar chart.
avg = mpgFrm[['cty', 'hwy']].groupby(mpgFrm['drv']).mean()
print(avg)

avg.plot.bar(figsize=(15, 5))
plt.show()
plt.close()
cty hwy drv 4 14.330097 19.174757 f 19.971698 28.160377 r 14.080000 21.000000
In [280]:
# Q6) Frequency of each vehicle class, as a bar chart.
class_count = mpgFrm['class'].value_counts()
print(class_count)

plt.figure(figsize=(8, 5))
class_count.plot.bar()

# Labels, then dashed horizontal guide lines only.
plt.title('자동차 클래스별 빈도수')
plt.xlabel('차종 (class)')
plt.ylabel('빈도수 (count)')
plt.grid(axis='y', alpha=0.7, linestyle='--')

plt.show()
plt.close()
class suv 62 compact 47 midsize 41 subcompact 35 pickup 33 minivan 11 2seater 5 Name: count, dtype: int64
In [ ]: