학습목표¶
- 데이터 조작(잘 꺼내야된다!!)
- 프레임은 기본 열 인덱스를 사용하고, 행 인덱스를 위해서는 슬라이싱
- loc[] 라벨값, iloc[] 정수값 : 인덱서
In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import json
# warning 제거
import warnings
warnings.filterwarnings('ignore')
# version check
print('numpy version - ' , np.__version__)
print('pandas version - ' , pd.__version__)
# 데이터 정보 출력 함수
def aryInfo(ary) :
print('type - ' , type(ary))
print('shape - ' , ary.shape)
print('ndim - ' , ary.ndim)
print('dtype - ' , ary.dtype)
print()
print('data -')
print(ary)
def seriesInfo(s) :
print('type - ' , type(s))
print('index - ' , s.index)
print('values - ' , s.values)
print('dtype - ' , s.dtype)
print()
print('data - ')
print(s)
def frmInfo(frm) :
print('type - ' , type(frm))
print('shape - ' , frm.shape)
print('ndim - ' , frm.ndim)
print('row idx - ' , frm.index , type(frm.index))
print('col idx - ' , frm.columns , type(frm.columns))
print('values - ' , type(frm.values))
print(frm.values)
print('data - ')
print(frm)
numpy version - 2.1.3 pandas version - 2.2.3
In [51]:
# 프레임은 데이터가 있는 열의 집합
scores = {
'kor' : [90,85,100,88,78],
'eng' : [90,85,100,88,78],
'mat' : [90,85,100,88,78]
} # 프레임으로 만들면 행이 5개 열이 3개 생성
frm = pd.DataFrame(scores,
index = ['강승우', '최호준', '임정섭', '이현우', '오신호'])
frmInfo(frm)
type - <class 'pandas.core.frame.DataFrame'>
shape - (5, 3)
ndim - 2
row idx - Index(['강승우', '최호준', '임정섭', '이현우', '오신호'], dtype='object') <class 'pandas.core.indexes.base.Index'>
col idx - Index(['kor', 'eng', 'mat'], dtype='object') <class 'pandas.core.indexes.base.Index'>
values - <class 'numpy.ndarray'>
[[ 90 90 90]
[ 85 85 85]
[100 100 100]
[ 88 88 88]
[ 78 78 78]]
data -
kor eng mat
강승우 90 90 90
최호준 85 85 85
임정섭 100 100 100
이현우 88 88 88
오신호 78 78 78
In [52]:
frm
Out[52]:
| kor | eng | mat | |
|---|---|---|---|
| 강승우 | 90 | 90 | 90 |
| 최호준 | 85 | 85 | 85 |
| 임정섭 | 100 | 100 | 100 |
| 이현우 | 88 | 88 | 88 |
| 오신호 | 78 | 78 | 78 |
In [53]:
# Quiz
# 모든 학생의 과목평균 점수를 새로운 열('mean') 추가하고 싶다면?
# axis = 0 (열), axis = 1 (행)
frm.values
Out[53]:
array([[ 90, 90, 90],
[ 85, 85, 85],
[100, 100, 100],
[ 88, 88, 88],
[ 78, 78, 78]])
In [69]:
frm['mean'] = np.mean(frm.values, axis=1).astype(np.int32)
frm
Out[69]:
| kor | eng | mat | mean | |
|---|---|---|---|---|
| 강승우 | 90 | 90 | 90 | 90 |
| 최호준 | 85 | 90 | 85 | 86 |
| 임정섭 | 100 | 100 | 100 | 100 |
| 이현우 | 88 | 88 | 88 | 88 |
| 오신호 | 78 | 78 | 78 | 78 |
In [70]:
# Quiz
# 최호준 학생의 영어점수를 90점으로 수정하고 평균 점수도 다시 계산
# iloc[], loc[] 행 인덱싱에서 사용하는 함수
frm.loc['최호준', 'eng'] = 90
frm['mean'] = np.mean(frm.values, axis=1).astype(np.int32)
frm
Out[70]:
| kor | eng | mat | mean | |
|---|---|---|---|---|
| 강승우 | 90 | 90 | 90 | 90 |
| 최호준 | 85 | 90 | 85 | 86 |
| 임정섭 | 100 | 100 | 100 | 100 |
| 이현우 | 88 | 88 | 88 | 88 |
| 오신호 | 78 | 78 | 78 | 78 |
In [78]:
# 시리즈
lee = frm.loc['이현우']
print(lee)
print('type - ', type(lee))
kor 88 eng 88 mat 88 mean 88 Name: 이현우, dtype: int64 type - <class 'pandas.core.series.Series'>
In [79]:
# 데이터 프레임
lee = frm.loc[['이현우']]
print(lee)
print('type - ', type(lee))
kor eng mat mean 이현우 88 88 88 88 type - <class 'pandas.core.frame.DataFrame'>
In [84]:
lim = frm.loc['임정섭', 'kor' : 'eng']
print(lim)
print('type - ', type(lim))
kor 100 eng 100 Name: 임정섭, dtype: int64 type - <class 'pandas.core.series.Series'>
- titanic dataset(seaborn)
In [135]:
titanicRawData = sns.load_dataset('titanic')
print('type - ', type(titanicRawData))
titanicRawData.head()
type - <class 'pandas.core.frame.DataFrame'>
Out[135]:
| survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | deck | embark_town | alive | alone | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | S | Third | man | True | NaN | Southampton | no | False |
| 1 | 1 | 1 | female | 38.0 | 1 | 0 | 71.2833 | C | First | woman | False | C | Cherbourg | yes | False |
| 2 | 1 | 3 | female | 26.0 | 0 | 0 | 7.9250 | S | Third | woman | False | NaN | Southampton | yes | True |
| 3 | 1 | 1 | female | 35.0 | 1 | 0 | 53.1000 | S | First | woman | False | C | Southampton | yes | False |
| 4 | 0 | 3 | male | 35.0 | 0 | 0 | 8.0500 | S | Third | man | True | NaN | Southampton | no | True |
In [107]:
titanicRawData.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 891 entries, 0 to 890 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 survived 891 non-null int64 1 pclass 891 non-null int64 2 sex 891 non-null object 3 age 714 non-null float64 4 sibsp 891 non-null int64 5 parch 891 non-null int64 6 fare 891 non-null float64 7 embarked 889 non-null object 8 class 891 non-null category 9 who 891 non-null object 10 adult_male 891 non-null bool 11 deck 203 non-null category 12 embark_town 889 non-null object 13 alive 891 non-null object 14 alone 891 non-null bool 15 age_by_10 891 non-null int64 dtypes: bool(2), category(2), float64(2), int64(5), object(5) memory usage: 87.6+ KB
In [96]:
# Quiz
# 선실등급(pclass)의 인원수를 확인하고 싶다면?
print('유일값 확인 - unique()')
print(titanicRawData['pclass'].unique())
print(titanicRawData['pclass'].value_counts().values)
유일값 확인 - unique() [3 1 2] [491 216 184]
In [99]:
# 데이터 프레임의 컬럼명 확인
print('type - ', type(titanicRawData.columns))
print(titanicRawData.columns)
type - <class 'pandas.core.indexes.base.Index'>
Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
'alive', 'alone'],
dtype='object')
In [108]:
# Quiz
# 기존나이에 10살을 더해서 age_by_10 열을 추가하고 싶다면?
titanicRawData['age_by_10'] = (titanicRawData['age'].values + 10).astype('int')
titanicRawData.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 891 entries, 0 to 890 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 survived 891 non-null int64 1 pclass 891 non-null int64 2 sex 891 non-null object 3 age 714 non-null float64 4 sibsp 891 non-null int64 5 parch 891 non-null int64 6 fare 891 non-null float64 7 embarked 889 non-null object 8 class 891 non-null category 9 who 891 non-null object 10 adult_male 891 non-null bool 11 deck 203 non-null category 12 embark_town 889 non-null object 13 alive 891 non-null object 14 alone 891 non-null bool 15 age_by_10 891 non-null int64 dtypes: bool(2), category(2), float64(2), int64(5), object(5) memory usage: 87.6+ KB
In [109]:
# age_by_10 컬럼을 삭제하고 싶다면?
# drop( , axis = 1, inplace = True)
titanicRawData.drop(['age_by_10'], axis = 1 , inplace = True)
In [131]:
titanicRawData.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 891 entries, 0 to 890 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 survived 891 non-null int64 1 pclass 891 non-null int64 2 sex 891 non-null object 3 age 714 non-null float64 4 sibsp 891 non-null int64 5 parch 891 non-null int64 6 fare 891 non-null float64 7 embarked 889 non-null object 8 class 891 non-null category 9 who 891 non-null object 10 adult_male 891 non-null bool 11 deck 203 non-null category 12 embark_town 889 non-null object 13 alive 891 non-null object 14 alone 891 non-null bool dtypes: bool(2), category(2), float64(2), int64(4), object(5) memory usage: 80.7+ KB
In [132]:
# 요금(fare)에 대한 통계(최대, 최소, 평균, 합계) 확인이 필요하다면?
print('fare max - ', np.max(titanicRawData['fare'].values))
print('fare min - ', np.min(titanicRawData['fare'].values))
print('fare mean - ', np.mean(titanicRawData['fare'].values))
fare max - 512.3292 fare min - 0.0 fare mean - 32.204207968574636
In [136]:
# Quiz
# 선실등급(pclass)이 3등급인 데이터만의 subset 만들고 싶다면?
pclassSubsetFrm = titanicRawData[titanicRawData['pclass'] == 3]
print('type - ', type(pclassSubsetFrm))
print(pclassSubsetFrm)
type - <class 'pandas.core.frame.DataFrame'>
survived pclass sex age sibsp parch fare embarked class \
0 0 3 male 22.0 1 0 7.2500 S Third
2 1 3 female 26.0 0 0 7.9250 S Third
4 0 3 male 35.0 0 0 8.0500 S Third
5 0 3 male NaN 0 0 8.4583 Q Third
7 0 3 male 2.0 3 1 21.0750 S Third
.. ... ... ... ... ... ... ... ... ...
882 0 3 female 22.0 0 0 10.5167 S Third
884 0 3 male 25.0 0 0 7.0500 S Third
885 0 3 female 39.0 0 5 29.1250 Q Third
888 0 3 female NaN 1 2 23.4500 S Third
890 0 3 male 32.0 0 0 7.7500 Q Third
who adult_male deck embark_town alive alone
0 man True NaN Southampton no False
2 woman False NaN Southampton yes True
4 man True NaN Southampton no True
5 man True NaN Queenstown no True
7 child False NaN Southampton no False
.. ... ... ... ... ... ...
882 woman False NaN Southampton no True
884 man True NaN Southampton no True
885 woman False NaN Queenstown no False
888 woman False NaN Southampton no False
890 man True NaN Queenstown no True
[491 rows x 15 columns]
In [144]:
# Quiz
# 위 서브셋에서 성별과(sex), 생존여부(survived)만 가지는 새로운 서브셋을 만들고 싶다면?
newsubset = pclassSubsetFrm[['sex','survived']]
print(newsubset.head())
sex survived 0 male 0 1 female 1 2 male 0 3 male 0 4 male 0
In [151]:
# 인덱스 재조정
# reset_index()
pclassSubsetFrm.reset_index(inplace = True)
print(pclassSubsetFrm.head())
index survived pclass sex age sibsp parch fare embarked \ 0 0 0 3 male 22.0 1 0 7.2500 S 1 1 1 3 female 26.0 0 0 7.9250 S 2 2 0 3 male 35.0 0 0 8.0500 S 3 3 0 3 male NaN 0 0 8.4583 Q 4 4 0 3 male 2.0 3 1 21.0750 S class who adult_male deck embark_town alive alone 0 Third man True NaN Southampton no False 1 Third woman False NaN Southampton yes True 2 Third man True NaN Southampton no True 3 Third man True NaN Queenstown no True 4 Third child False NaN Southampton no False
In [152]:
print(pclassSubsetFrm)
index survived pclass sex age sibsp parch fare embarked \
0 0 0 3 male 22.0 1 0 7.2500 S
1 1 1 3 female 26.0 0 0 7.9250 S
2 2 0 3 male 35.0 0 0 8.0500 S
3 3 0 3 male NaN 0 0 8.4583 Q
4 4 0 3 male 2.0 3 1 21.0750 S
.. ... ... ... ... ... ... ... ... ...
486 486 0 3 female 22.0 0 0 10.5167 S
487 487 0 3 male 25.0 0 0 7.0500 S
488 488 0 3 female 39.0 0 5 29.1250 Q
489 489 0 3 female NaN 1 2 23.4500 S
490 490 0 3 male 32.0 0 0 7.7500 Q
class who adult_male deck embark_town alive alone
0 Third man True NaN Southampton no False
1 Third woman False NaN Southampton yes True
2 Third man True NaN Southampton no True
3 Third man True NaN Queenstown no True
4 Third child False NaN Southampton no False
.. ... ... ... ... ... ... ...
486 Third woman False NaN Southampton no True
487 Third man True NaN Southampton no True
488 Third woman False NaN Queenstown no False
489 Third woman False NaN Southampton no False
490 Third man True NaN Queenstown no True
[491 rows x 16 columns]
In [153]:
pclassSubsetFrm.drop('index', axis = 1 , inplace =True)
pclassSubsetFrm
Out[153]:
| survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | deck | embark_town | alive | alone | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | S | Third | man | True | NaN | Southampton | no | False |
| 1 | 1 | 3 | female | 26.0 | 0 | 0 | 7.9250 | S | Third | woman | False | NaN | Southampton | yes | True |
| 2 | 0 | 3 | male | 35.0 | 0 | 0 | 8.0500 | S | Third | man | True | NaN | Southampton | no | True |
| 3 | 0 | 3 | male | NaN | 0 | 0 | 8.4583 | Q | Third | man | True | NaN | Queenstown | no | True |
| 4 | 0 | 3 | male | 2.0 | 3 | 1 | 21.0750 | S | Third | child | False | NaN | Southampton | no | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 486 | 0 | 3 | female | 22.0 | 0 | 0 | 10.5167 | S | Third | woman | False | NaN | Southampton | no | True |
| 487 | 0 | 3 | male | 25.0 | 0 | 0 | 7.0500 | S | Third | man | True | NaN | Southampton | no | True |
| 488 | 0 | 3 | female | 39.0 | 0 | 5 | 29.1250 | Q | Third | woman | False | NaN | Queenstown | no | False |
| 489 | 0 | 3 | female | NaN | 1 | 2 | 23.4500 | S | Third | woman | False | NaN | Southampton | no | False |
| 490 | 0 | 3 | male | 32.0 | 0 | 0 | 7.7500 | Q | Third | man | True | NaN | Queenstown | no | True |
491 rows × 15 columns
In [154]:
# set_index : 특정 컬럼을 인덱스로 변경하는 함수
pclassSubsetFrm.reset_index(inplace=True)
pclassSubsetFrm
Out[154]:
| index | survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | deck | embark_town | alive | alone | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | S | Third | man | True | NaN | Southampton | no | False |
| 1 | 1 | 1 | 3 | female | 26.0 | 0 | 0 | 7.9250 | S | Third | woman | False | NaN | Southampton | yes | True |
| 2 | 2 | 0 | 3 | male | 35.0 | 0 | 0 | 8.0500 | S | Third | man | True | NaN | Southampton | no | True |
| 3 | 3 | 0 | 3 | male | NaN | 0 | 0 | 8.4583 | Q | Third | man | True | NaN | Queenstown | no | True |
| 4 | 4 | 0 | 3 | male | 2.0 | 3 | 1 | 21.0750 | S | Third | child | False | NaN | Southampton | no | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 486 | 486 | 0 | 3 | female | 22.0 | 0 | 0 | 10.5167 | S | Third | woman | False | NaN | Southampton | no | True |
| 487 | 487 | 0 | 3 | male | 25.0 | 0 | 0 | 7.0500 | S | Third | man | True | NaN | Southampton | no | True |
| 488 | 488 | 0 | 3 | female | 39.0 | 0 | 5 | 29.1250 | Q | Third | woman | False | NaN | Queenstown | no | False |
| 489 | 489 | 0 | 3 | female | NaN | 1 | 2 | 23.4500 | S | Third | woman | False | NaN | Southampton | no | False |
| 490 | 490 | 0 | 3 | male | 32.0 | 0 | 0 | 7.7500 | Q | Third | man | True | NaN | Queenstown | no | True |
491 rows × 16 columns
In [155]:
pclassSubsetFrm.set_index('index', inplace=True)
pclassSubsetFrm
Out[155]:
| survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | deck | embark_town | alive | alone | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| index | |||||||||||||||
| 0 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | S | Third | man | True | NaN | Southampton | no | False |
| 1 | 1 | 3 | female | 26.0 | 0 | 0 | 7.9250 | S | Third | woman | False | NaN | Southampton | yes | True |
| 2 | 0 | 3 | male | 35.0 | 0 | 0 | 8.0500 | S | Third | man | True | NaN | Southampton | no | True |
| 3 | 0 | 3 | male | NaN | 0 | 0 | 8.4583 | Q | Third | man | True | NaN | Queenstown | no | True |
| 4 | 0 | 3 | male | 2.0 | 3 | 1 | 21.0750 | S | Third | child | False | NaN | Southampton | no | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 486 | 0 | 3 | female | 22.0 | 0 | 0 | 10.5167 | S | Third | woman | False | NaN | Southampton | no | True |
| 487 | 0 | 3 | male | 25.0 | 0 | 0 | 7.0500 | S | Third | man | True | NaN | Southampton | no | True |
| 488 | 0 | 3 | female | 39.0 | 0 | 5 | 29.1250 | Q | Third | woman | False | NaN | Queenstown | no | False |
| 489 | 0 | 3 | female | NaN | 1 | 2 | 23.4500 | S | Third | woman | False | NaN | Southampton | no | False |
| 490 | 0 | 3 | male | 32.0 | 0 | 0 | 7.7500 | Q | Third | man | True | NaN | Queenstown | no | True |
491 rows × 15 columns
In [161]:
# Quiz
# 원본 데이터로부터 나이가 60이상이고 선실등급이 1등급이면서 성별이 여자인 데이터만 추출하여 서브셋을 만들고 싶다면?
sub = titanicRawData[(titanicRawData['age'] >= 60) & (titanicRawData['pclass'] == 1) & (titanicRawData['sex'] == 'female')]
print(sub)
survived pclass sex age sibsp parch fare embarked class \
275 1 1 female 63.0 1 0 77.9583 S First
366 1 1 female 60.0 1 0 75.2500 C First
829 1 1 female 62.0 0 0 80.0000 NaN First
who adult_male deck embark_town alive alone
275 woman False D Southampton yes False
366 woman False D Cherbourg yes False
829 woman False B NaN yes True
In [162]:
sub = titanicRawData[(titanicRawData['age'] >= 60) & (titanicRawData['pclass'] == 1) & (titanicRawData['sex'] == 'male')]
print(sub)
survived pclass sex age sibsp parch fare embarked class \
54 0 1 male 65.0 0 1 61.9792 C First
96 0 1 male 71.0 0 0 34.6542 C First
170 0 1 male 61.0 0 0 33.5000 S First
252 0 1 male 62.0 0 0 26.5500 S First
438 0 1 male 64.0 1 4 263.0000 S First
456 0 1 male 65.0 0 0 26.5500 S First
493 0 1 male 71.0 0 0 49.5042 C First
545 0 1 male 64.0 0 0 26.0000 S First
555 0 1 male 62.0 0 0 26.5500 S First
587 1 1 male 60.0 1 1 79.2000 C First
625 0 1 male 61.0 0 0 32.3208 S First
630 1 1 male 80.0 0 0 30.0000 S First
694 0 1 male 60.0 0 0 26.5500 S First
745 0 1 male 70.0 1 1 71.0000 S First
who adult_male deck embark_town alive alone
54 man True B Cherbourg no False
96 man True A Cherbourg no True
170 man True B Southampton no True
252 man True C Southampton no True
438 man True C Southampton no False
456 man True E Southampton no True
493 man True NaN Cherbourg no True
545 man True NaN Southampton no True
555 man True NaN Southampton no True
587 man True B Cherbourg yes False
625 man True D Southampton no True
630 man True A Southampton yes True
694 man True NaN Southampton no True
745 man True B Southampton no False
sort : 인덱스 기준, 열 값을 기준으로 # ascending = True(오름차순) False(내림차순)¶
- sort_index(axis = , ascending =)
- sort_values(by = , ascending =)
In [164]:
# 원본데이터로부터 승객의 나이를 기준으로 내림차순한 서브셋을 만들고 싶다면?
subsetFrm = titanicRawData.sort_values(by='age', ascending=False)
subsetFrm.reset_index(inplace = True)
subsetFrm.drop('index', axis = 1, inplace = True)
subsetFrm
Out[164]:
| survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | deck | embark_town | alive | alone | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | male | 80.0 | 0 | 0 | 30.0000 | S | First | man | True | A | Southampton | yes | True |
| 1 | 0 | 3 | male | 74.0 | 0 | 0 | 7.7750 | S | Third | man | True | NaN | Southampton | no | True |
| 2 | 0 | 1 | male | 71.0 | 0 | 0 | 49.5042 | C | First | man | True | NaN | Cherbourg | no | True |
| 3 | 0 | 1 | male | 71.0 | 0 | 0 | 34.6542 | C | First | man | True | A | Cherbourg | no | True |
| 4 | 0 | 3 | male | 70.5 | 0 | 0 | 7.7500 | Q | Third | man | True | NaN | Queenstown | no | True |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 886 | 0 | 3 | male | NaN | 0 | 0 | 7.2292 | C | Third | man | True | NaN | Cherbourg | no | True |
| 887 | 0 | 3 | female | NaN | 8 | 2 | 69.5500 | S | Third | woman | False | NaN | Southampton | no | False |
| 888 | 0 | 3 | male | NaN | 0 | 0 | 9.5000 | S | Third | man | True | NaN | Southampton | no | True |
| 889 | 0 | 3 | male | NaN | 0 | 0 | 7.8958 | S | Third | man | True | NaN | Southampton | no | True |
| 890 | 0 | 3 | female | NaN | 1 | 2 | 23.4500 | S | Third | woman | False | NaN | Southampton | no | False |
891 rows × 15 columns
In [168]:
print('성별에 따른 승객수를 시각화하기 위해서 정렬을 한다면 - ')
titanicRawData['sex'].value_counts().sort_values(ascending = False)
성별에 따른 승객수를 시각화하기 위해서 정렬을 한다면 -
Out[168]:
sex male 577 female 314 Name: count, dtype: int64
In [169]:
titanicRawData.sort_index(ascending= False)
Out[169]:
| survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | deck | embark_town | alive | alone | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 890 | 0 | 3 | male | 32.0 | 0 | 0 | 7.7500 | Q | Third | man | True | NaN | Queenstown | no | True |
| 889 | 1 | 1 | male | 26.0 | 0 | 0 | 30.0000 | C | First | man | True | C | Cherbourg | yes | True |
| 888 | 0 | 3 | female | NaN | 1 | 2 | 23.4500 | S | Third | woman | False | NaN | Southampton | no | False |
| 887 | 1 | 1 | female | 19.0 | 0 | 0 | 30.0000 | S | First | woman | False | B | Southampton | yes | True |
| 886 | 0 | 2 | male | 27.0 | 0 | 0 | 13.0000 | S | Second | man | True | NaN | Southampton | no | True |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4 | 0 | 3 | male | 35.0 | 0 | 0 | 8.0500 | S | Third | man | True | NaN | Southampton | no | True |
| 3 | 1 | 1 | female | 35.0 | 1 | 0 | 53.1000 | S | First | woman | False | C | Southampton | yes | False |
| 2 | 1 | 3 | female | 26.0 | 0 | 0 | 7.9250 | S | Third | woman | False | NaN | Southampton | yes | True |
| 1 | 1 | 1 | female | 38.0 | 1 | 0 | 71.2833 | C | First | woman | False | C | Cherbourg | yes | False |
| 0 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | S | Third | man | True | NaN | Southampton | no | False |
891 rows × 15 columns
- Pandas
- csv 파일 입출력과 데이터 조작 (excel, json, html, scraping 비정형 데이터)
- read_xxxxx() , to_xxxxx()
In [170]:
namesFrm = pd.read_csv('./data/year2022_baby_name.csv',
sep=',',
encoding='utf-8')
print('type - ', type(namesFrm))
type - <class 'pandas.core.frame.DataFrame'>
In [172]:
namesFrm.head()
Out[172]:
| NAME | GENDER | COUNT | |
|---|---|---|---|
| 0 | Isabella | F | 22731 |
| 1 | Sophia | F | 20477 |
| 2 | Emma | F | 17179 |
| 3 | Olivia | F | 16860 |
| 4 | Ava | F | 15300 |
In [173]:
namesFrm.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 33838 entries, 0 to 33837 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 NAME 33838 non-null object 1 GENDER 33838 non-null object 2 COUNT 33838 non-null int64 dtypes: int64(1), object(2) memory usage: 793.2+ KB
In [175]:
# namesFrm.describe() # 숫자데이터 통계정보
namesFrm.columns
Out[175]:
Index(['NAME', 'GENDER', 'COUNT'], dtype='object')
In [177]:
# Quiz
# count 열을 기준으로 내림차순 정렬하여 서브셋을 만든다면?
# 인덱스를 확인하고 reset시킨후
# 불필요한 'index' 열 삭제
subset = namesFrm.sort_values(by='COUNT', ascending=False)
subset.reset_index(inplace = True)
subset.drop('index', axis = 1, inplace = True)
subset
Out[177]:
| NAME | GENDER | COUNT | |
|---|---|---|---|
| 0 | Isabella | F | 22731 |
| 1 | Jacob | M | 21875 |
| 2 | Sophia | F | 20477 |
| 3 | Ethan | M | 17866 |
| 4 | Emma | F | 17179 |
| ... | ... | ... | ... |
| 33833 | Xaine | M | 5 |
| 33834 | Xaveon | M | 5 |
| 33835 | Xavious | M | 5 |
| 33836 | Xiomar | M | 5 |
| 33837 | Xylan | M | 5 |
33838 rows × 3 columns
In [191]:
# Quiz
# 열 이름을 변경하고자 하고( NAME -> name, GENDER -> gender , COUNT -> count)
# 성별(gender)이 남자인 데이터를 추출한다면?
# type(subset.columns.values)
# for col in subset.columns.values:
# print(col.lower())
cols = [ col.lower() for col in subset.columns.values]
subset.columns = cols
print(subset)
# subset.rename(columns = cols, inplace = True) # 리스트 형태라서 안됨 딕셔너리로 만들어줘야함
# subset[subset['gender']=='M']
subset.rename(columns = {col: col.lower() for col in subset.columns}, inplace = True) # 딕셔너리 컴프리헨션
subset[subset['gender']=='M']
# subset.rename(columns={'NAME': 'name', 'GENDER': 'gender', 'COUNT': 'count'}, inplace=True)
# M = subset[subset['gender'] == 'M']
# print(M)
name gender count 0 Isabella F 22731 1 Jacob M 21875 2 Sophia F 20477 3 Ethan M 17866 4 Emma F 17179 ... ... ... ... 33833 Xaine M 5 33834 Xaveon M 5 33835 Xavious M 5 33836 Xiomar M 5 33837 Xylan M 5 [33838 rows x 3 columns]
Out[191]:
| name | gender | count | |
|---|---|---|---|
| 1 | Jacob | M | 21875 |
| 3 | Ethan | M | 17866 |
| 5 | Michael | M | 17133 |
| 6 | Jayden | M | 17030 |
| 7 | William | M | 16870 |
| ... | ... | ... | ... |
| 33833 | Xaine | M | 5 |
| 33834 | Xaveon | M | 5 |
| 33835 | Xavious | M | 5 |
| 33836 | Xiomar | M | 5 |
| 33837 | Xylan | M | 5 |
14140 rows × 3 columns
통계량 확인¶
- groupby(열 인덱스 | 인덱스) : 데이터를 그룹으로 분할하여 독립된 그룹에 대해서 별도의 데이터 처리
- split, apply(lambda), combine, agg([count, sum, mean])
In [198]:
frm = pd.read_csv('./data/service_data_groupby_sample.csv',
encoding='cp949')
print('type - ', type(frm))
frm
type - <class 'pandas.core.frame.DataFrame'>
Out[198]:
| id | gender | height | age | region | |
|---|---|---|---|---|---|
| 0 | 1 | 남자 | 175 | 22 | 서울 |
| 1 | 2 | 여자 | 160 | 23 | 서울 |
| 2 | 3 | 여자 | 161 | 21 | 서울 |
| 3 | 4 | 여자 | 170 | 33 | 서울 |
| 4 | 5 | 여자 | 155 | 35 | 경기 |
| 5 | 6 | 남자 | 181 | 41 | 서울 |
| 6 | 7 | 남자 | 183 | 33 | 충북 |
| 7 | 8 | 여자 | 171 | 22 | 서울 |
| 8 | 9 | 남자 | 188 | 29 | 경기 |
| 9 | 10 | 남자 | 177 | 39 | 인천 |
| 10 | 11 | 여자 | 152 | 40 | 서울 |
| 11 | 12 | 남자 | 180 | 24 | 서울 |
In [194]:
frm.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 12 entries, 0 to 11 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 12 non-null int64 1 gender 12 non-null object 2 height 12 non-null int64 3 age 12 non-null int64 4 region 12 non-null object dtypes: int64(3), object(2) memory usage: 612.0+ bytes
In [203]:
# Quiz
# 지역별 나이평균을 확인하고 싶다면??
# type(frm.groupby('region').get_group('경기'))
# type(frm.groupby('region').get_group('경기')['age'])
frm.groupby('region')['age'].mean()
Out[203]:
region 경기 32.00 서울 28.25 인천 39.00 충북 33.00 Name: age, dtype: float64
In [209]:
# Quiz
# 성별을 기준으로 그룹을 나누고 싶다면 -
tmp = frm.groupby('gender')[['height']].mean()
tmp.reset_index(inplace = True)
tmp
Out[209]:
| gender | height | |
|---|---|---|
| 0 | 남자 | 180.666667 |
| 1 | 여자 | 161.500000 |
In [216]:
# 다중통계량 : agg()
subset = frm.drop('region', axis = 1)
subset
Out[216]:
| id | gender | height | age | |
|---|---|---|---|---|
| 0 | 1 | 남자 | 175 | 22 |
| 1 | 2 | 여자 | 160 | 23 |
| 2 | 3 | 여자 | 161 | 21 |
| 3 | 4 | 여자 | 170 | 33 |
| 4 | 5 | 여자 | 155 | 35 |
| 5 | 6 | 남자 | 181 | 41 |
| 6 | 7 | 남자 | 183 | 33 |
| 7 | 8 | 여자 | 171 | 22 |
| 8 | 9 | 남자 | 188 | 29 |
| 9 | 10 | 남자 | 177 | 39 |
| 10 | 11 | 여자 | 152 | 40 |
| 11 | 12 | 남자 | 180 | 24 |
In [215]:
subset.groupby('gender').agg(['mean', 'var', 'std'])
Out[215]:
| id | height | age | |||||||
|---|---|---|---|---|---|---|---|---|---|
| mean | var | std | mean | var | std | mean | var | std | |
| gender | |||||||||
| 남자 | 7.5 | 14.7 | 3.834058 | 180.666667 | 21.066667 | 4.589844 | 31.333333 | 60.266667 | 7.763161 |
| 여자 | 5.5 | 11.5 | 3.391165 | 161.500000 | 59.500000 | 7.713624 | 29.000000 | 64.400000 | 8.024961 |
In [218]:
frm.groupby('gender')['age'].agg(['max', 'min', 'mean', 'median']).reset_index()
Out[218]:
| gender | max | min | mean | median | |
|---|---|---|---|---|---|
| 0 | 남자 | 41 | 22 | 31.333333 | 31.0 |
| 1 | 여자 | 40 | 21 | 29.000000 | 28.0 |
In [223]:
# Quiz
# 성별에 따른 거주지의 최빈값(mode())을 확인하고 싶다면?
# frm.groupby('gender')['region'].agg(lambda x : x.mode())
frm.groupby('gender')['region'].apply(lambda x : x.mode())
Out[223]:
gender 남자 0 서울 여자 0 서울 Name: region, dtype: object
In [224]:
tipsFrm = sns.load_dataset('tips')
tipsFrm.head()
Out[224]:
| total_bill | tip | sex | smoker | day | time | size | |
|---|---|---|---|---|---|---|---|
| 0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
| 1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
| 2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
| 3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
| 4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
In [225]:
tipsFrm.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 244 entries, 0 to 243 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 total_bill 244 non-null float64 1 tip 244 non-null float64 2 sex 244 non-null category 3 smoker 244 non-null category 4 day 244 non-null category 5 time 244 non-null category 6 size 244 non-null int64 dtypes: category(4), float64(2), int64(1) memory usage: 7.4 KB
In [226]:
tipsFrm.columns
Out[226]:
Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')
In [228]:
titanicFrm = sns.load_dataset('titanic')
titanicFrm.head()
Out[228]:
| survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | deck | embark_town | alive | alone | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | S | Third | man | True | NaN | Southampton | no | False |
| 1 | 1 | 1 | female | 38.0 | 1 | 0 | 71.2833 | C | First | woman | False | C | Cherbourg | yes | False |
| 2 | 1 | 3 | female | 26.0 | 0 | 0 | 7.9250 | S | Third | woman | False | NaN | Southampton | yes | True |
| 3 | 1 | 1 | female | 35.0 | 1 | 0 | 53.1000 | S | First | woman | False | C | Southampton | yes | False |
| 4 | 0 | 3 | male | 35.0 | 0 | 0 | 8.0500 | S | Third | man | True | NaN | Southampton | no | True |
In [234]:
# Quiz
# tipsFrm 전체 평균 팁은 얼마일까요?
tip = tipsFrm['tip'].mean()
print(tip)
2.99827868852459
In [240]:
# Quiz
# 남성과 여성 중 평균 팁이 더 높은 성별을 확인하고 싶다면?
print(tipsFrm.groupby('sex')['tip'].agg('mean').reset_index())
# 흡연자와 비흡연자 중 편균 팁 비율이 높은 그룹은?
# assign
print(tipsFrm.assign(pct = tipsFrm['tip'] / tipsFrm['total_bill']).groupby('smoker')['pct'].mean())
sex tip 0 Male 3.089618 1 Female 2.833448 smoker Yes 0.163196 No 0.159328 Name: pct, dtype: float64
In [250]:
# Quiz
# 팁이 가장 많이 발생하는 요일을 확인하고 싶다면?
result = tipsFrm.groupby('day')['tip'].sum().sort_values(ascending=False).head(1)
print('type - ', type(result))
print(result)
type - <class 'pandas.core.series.Series'> day Sat 260.4 Name: tip, dtype: float64
In [258]:
# Quiz
# ['Dinner', 'Lunch'] 중 평균 팁 비율이 높은 시간대를 확인하고 싶다면?
# tipsFrm['time'].unique()
# tipsFrm.groupby('time')['tip'].mean()
tipsFrm.assign(pct = tipsFrm['tip'] / tipsFrm['total_bill']).groupby('time')['pct'].mean().sort_values(ascending=False).head(1)
tipsFrm['pct'] = tipsFrm['tip'] / tipsFrm['total_bill']
result = tipsFrm.groupby('time')['pct'].mean().sort_values(ascending = False).head(1)
print(result)
time Lunch 0.164128 Name: pct, dtype: float64
In [268]:
# Quiz : titanicFrm
# subset - (age,sex,class,fare,survived)
# subset = titanicFrm[['age', 'sex', 'class', 'fare', 'survived']]
# subset.head()
subset = titanicFrm.loc[: ,['age', 'sex', 'class', 'fare', 'survived']]
subset.head()
Out[268]:
| age | sex | class | fare | survived | |
|---|---|---|---|---|---|
| 0 | 22.0 | male | Third | 7.2500 | 0 |
| 1 | 38.0 | female | First | 71.2833 | 1 |
| 2 | 26.0 | female | Third | 7.9250 | 1 |
| 3 | 35.0 | female | First | 53.1000 | 1 |
| 4 | 35.0 | male | Third | 8.0500 | 0 |
In [285]:
# Quiz
# 선실등급에 따른 그룹을 만들고 1등급 승객만 데이터 프레임 형식으로 만들어 본다면?
# firstClass = subset[subset['class'] == 'First']
# print(firstClass)
grp = subset.groupby('class')
# print(grp)
# print(grp.get_group('First'))
result = grp.get_group('First')
print(result)
#subset.loc[grp.groups['First'].values , : ]
age sex class fare survived 1 38.0 female First 71.2833 1 3 35.0 female First 53.1000 1 6 54.0 male First 51.8625 0 11 58.0 female First 26.5500 1 23 28.0 male First 35.5000 1 .. ... ... ... ... ... 871 47.0 female First 52.5542 1 872 33.0 male First 5.0000 0 879 56.0 female First 83.1583 1 887 19.0 female First 30.0000 1 889 26.0 male First 30.0000 1 [216 rows x 5 columns]
In [286]:
irisFrm = sns.load_dataset('iris')
irisFrm.head()
Out[286]:
| sepal_length | sepal_width | petal_length | petal_width | species | |
|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
In [287]:
irisFrm['species'].value_counts()
Out[287]:
species setosa 50 versicolor 50 virginica 50 Name: count, dtype: int64
In [288]:
grp = irisFrm.groupby('species')
print(grp)
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000014B9D9DF050>
In [289]:
print(grp.groups)
{'setosa': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49], 'versicolor': [50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99], 'virginica': [100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149]}
In [291]:
for key, group in grp :
print('key - ', key)
print()
display(group)
key - setosa
| sepal_length | sepal_width | petal_length | petal_width | species | |
|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
| 5 | 5.4 | 3.9 | 1.7 | 0.4 | setosa |
| 6 | 4.6 | 3.4 | 1.4 | 0.3 | setosa |
| 7 | 5.0 | 3.4 | 1.5 | 0.2 | setosa |
| 8 | 4.4 | 2.9 | 1.4 | 0.2 | setosa |
| 9 | 4.9 | 3.1 | 1.5 | 0.1 | setosa |
| 10 | 5.4 | 3.7 | 1.5 | 0.2 | setosa |
| 11 | 4.8 | 3.4 | 1.6 | 0.2 | setosa |
| 12 | 4.8 | 3.0 | 1.4 | 0.1 | setosa |
| 13 | 4.3 | 3.0 | 1.1 | 0.1 | setosa |
| 14 | 5.8 | 4.0 | 1.2 | 0.2 | setosa |
| 15 | 5.7 | 4.4 | 1.5 | 0.4 | setosa |
| 16 | 5.4 | 3.9 | 1.3 | 0.4 | setosa |
| 17 | 5.1 | 3.5 | 1.4 | 0.3 | setosa |
| 18 | 5.7 | 3.8 | 1.7 | 0.3 | setosa |
| 19 | 5.1 | 3.8 | 1.5 | 0.3 | setosa |
| 20 | 5.4 | 3.4 | 1.7 | 0.2 | setosa |
| 21 | 5.1 | 3.7 | 1.5 | 0.4 | setosa |
| 22 | 4.6 | 3.6 | 1.0 | 0.2 | setosa |
| 23 | 5.1 | 3.3 | 1.7 | 0.5 | setosa |
| 24 | 4.8 | 3.4 | 1.9 | 0.2 | setosa |
| 25 | 5.0 | 3.0 | 1.6 | 0.2 | setosa |
| 26 | 5.0 | 3.4 | 1.6 | 0.4 | setosa |
| 27 | 5.2 | 3.5 | 1.5 | 0.2 | setosa |
| 28 | 5.2 | 3.4 | 1.4 | 0.2 | setosa |
| 29 | 4.7 | 3.2 | 1.6 | 0.2 | setosa |
| 30 | 4.8 | 3.1 | 1.6 | 0.2 | setosa |
| 31 | 5.4 | 3.4 | 1.5 | 0.4 | setosa |
| 32 | 5.2 | 4.1 | 1.5 | 0.1 | setosa |
| 33 | 5.5 | 4.2 | 1.4 | 0.2 | setosa |
| 34 | 4.9 | 3.1 | 1.5 | 0.2 | setosa |
| 35 | 5.0 | 3.2 | 1.2 | 0.2 | setosa |
| 36 | 5.5 | 3.5 | 1.3 | 0.2 | setosa |
| 37 | 4.9 | 3.6 | 1.4 | 0.1 | setosa |
| 38 | 4.4 | 3.0 | 1.3 | 0.2 | setosa |
| 39 | 5.1 | 3.4 | 1.5 | 0.2 | setosa |
| 40 | 5.0 | 3.5 | 1.3 | 0.3 | setosa |
| 41 | 4.5 | 2.3 | 1.3 | 0.3 | setosa |
| 42 | 4.4 | 3.2 | 1.3 | 0.2 | setosa |
| 43 | 5.0 | 3.5 | 1.6 | 0.6 | setosa |
| 44 | 5.1 | 3.8 | 1.9 | 0.4 | setosa |
| 45 | 4.8 | 3.0 | 1.4 | 0.3 | setosa |
| 46 | 5.1 | 3.8 | 1.6 | 0.2 | setosa |
| 47 | 4.6 | 3.2 | 1.4 | 0.2 | setosa |
| 48 | 5.3 | 3.7 | 1.5 | 0.2 | setosa |
| 49 | 5.0 | 3.3 | 1.4 | 0.2 | setosa |
key - versicolor
| sepal_length | sepal_width | petal_length | petal_width | species | |
|---|---|---|---|---|---|
| 50 | 7.0 | 3.2 | 4.7 | 1.4 | versicolor |
| 51 | 6.4 | 3.2 | 4.5 | 1.5 | versicolor |
| 52 | 6.9 | 3.1 | 4.9 | 1.5 | versicolor |
| 53 | 5.5 | 2.3 | 4.0 | 1.3 | versicolor |
| 54 | 6.5 | 2.8 | 4.6 | 1.5 | versicolor |
| 55 | 5.7 | 2.8 | 4.5 | 1.3 | versicolor |
| 56 | 6.3 | 3.3 | 4.7 | 1.6 | versicolor |
| 57 | 4.9 | 2.4 | 3.3 | 1.0 | versicolor |
| 58 | 6.6 | 2.9 | 4.6 | 1.3 | versicolor |
| 59 | 5.2 | 2.7 | 3.9 | 1.4 | versicolor |
| 60 | 5.0 | 2.0 | 3.5 | 1.0 | versicolor |
| 61 | 5.9 | 3.0 | 4.2 | 1.5 | versicolor |
| 62 | 6.0 | 2.2 | 4.0 | 1.0 | versicolor |
| 63 | 6.1 | 2.9 | 4.7 | 1.4 | versicolor |
| 64 | 5.6 | 2.9 | 3.6 | 1.3 | versicolor |
| 65 | 6.7 | 3.1 | 4.4 | 1.4 | versicolor |
| 66 | 5.6 | 3.0 | 4.5 | 1.5 | versicolor |
| 67 | 5.8 | 2.7 | 4.1 | 1.0 | versicolor |
| 68 | 6.2 | 2.2 | 4.5 | 1.5 | versicolor |
| 69 | 5.6 | 2.5 | 3.9 | 1.1 | versicolor |
| 70 | 5.9 | 3.2 | 4.8 | 1.8 | versicolor |
| 71 | 6.1 | 2.8 | 4.0 | 1.3 | versicolor |
| 72 | 6.3 | 2.5 | 4.9 | 1.5 | versicolor |
| 73 | 6.1 | 2.8 | 4.7 | 1.2 | versicolor |
| 74 | 6.4 | 2.9 | 4.3 | 1.3 | versicolor |
| 75 | 6.6 | 3.0 | 4.4 | 1.4 | versicolor |
| 76 | 6.8 | 2.8 | 4.8 | 1.4 | versicolor |
| 77 | 6.7 | 3.0 | 5.0 | 1.7 | versicolor |
| 78 | 6.0 | 2.9 | 4.5 | 1.5 | versicolor |
| 79 | 5.7 | 2.6 | 3.5 | 1.0 | versicolor |
| 80 | 5.5 | 2.4 | 3.8 | 1.1 | versicolor |
| 81 | 5.5 | 2.4 | 3.7 | 1.0 | versicolor |
| 82 | 5.8 | 2.7 | 3.9 | 1.2 | versicolor |
| 83 | 6.0 | 2.7 | 5.1 | 1.6 | versicolor |
| 84 | 5.4 | 3.0 | 4.5 | 1.5 | versicolor |
| 85 | 6.0 | 3.4 | 4.5 | 1.6 | versicolor |
| 86 | 6.7 | 3.1 | 4.7 | 1.5 | versicolor |
| 87 | 6.3 | 2.3 | 4.4 | 1.3 | versicolor |
| 88 | 5.6 | 3.0 | 4.1 | 1.3 | versicolor |
| 89 | 5.5 | 2.5 | 4.0 | 1.3 | versicolor |
| 90 | 5.5 | 2.6 | 4.4 | 1.2 | versicolor |
| 91 | 6.1 | 3.0 | 4.6 | 1.4 | versicolor |
| 92 | 5.8 | 2.6 | 4.0 | 1.2 | versicolor |
| 93 | 5.0 | 2.3 | 3.3 | 1.0 | versicolor |
| 94 | 5.6 | 2.7 | 4.2 | 1.3 | versicolor |
| 95 | 5.7 | 3.0 | 4.2 | 1.2 | versicolor |
| 96 | 5.7 | 2.9 | 4.2 | 1.3 | versicolor |
| 97 | 6.2 | 2.9 | 4.3 | 1.3 | versicolor |
| 98 | 5.1 | 2.5 | 3.0 | 1.1 | versicolor |
| 99 | 5.7 | 2.8 | 4.1 | 1.3 | versicolor |
key - virginica
| sepal_length | sepal_width | petal_length | petal_width | species | |
|---|---|---|---|---|---|
| 100 | 6.3 | 3.3 | 6.0 | 2.5 | virginica |
| 101 | 5.8 | 2.7 | 5.1 | 1.9 | virginica |
| 102 | 7.1 | 3.0 | 5.9 | 2.1 | virginica |
| 103 | 6.3 | 2.9 | 5.6 | 1.8 | virginica |
| 104 | 6.5 | 3.0 | 5.8 | 2.2 | virginica |
| 105 | 7.6 | 3.0 | 6.6 | 2.1 | virginica |
| 106 | 4.9 | 2.5 | 4.5 | 1.7 | virginica |
| 107 | 7.3 | 2.9 | 6.3 | 1.8 | virginica |
| 108 | 6.7 | 2.5 | 5.8 | 1.8 | virginica |
| 109 | 7.2 | 3.6 | 6.1 | 2.5 | virginica |
| 110 | 6.5 | 3.2 | 5.1 | 2.0 | virginica |
| 111 | 6.4 | 2.7 | 5.3 | 1.9 | virginica |
| 112 | 6.8 | 3.0 | 5.5 | 2.1 | virginica |
| 113 | 5.7 | 2.5 | 5.0 | 2.0 | virginica |
| 114 | 5.8 | 2.8 | 5.1 | 2.4 | virginica |
| 115 | 6.4 | 3.2 | 5.3 | 2.3 | virginica |
| 116 | 6.5 | 3.0 | 5.5 | 1.8 | virginica |
| 117 | 7.7 | 3.8 | 6.7 | 2.2 | virginica |
| 118 | 7.7 | 2.6 | 6.9 | 2.3 | virginica |
| 119 | 6.0 | 2.2 | 5.0 | 1.5 | virginica |
| 120 | 6.9 | 3.2 | 5.7 | 2.3 | virginica |
| 121 | 5.6 | 2.8 | 4.9 | 2.0 | virginica |
| 122 | 7.7 | 2.8 | 6.7 | 2.0 | virginica |
| 123 | 6.3 | 2.7 | 4.9 | 1.8 | virginica |
| 124 | 6.7 | 3.3 | 5.7 | 2.1 | virginica |
| 125 | 7.2 | 3.2 | 6.0 | 1.8 | virginica |
| 126 | 6.2 | 2.8 | 4.8 | 1.8 | virginica |
| 127 | 6.1 | 3.0 | 4.9 | 1.8 | virginica |
| 128 | 6.4 | 2.8 | 5.6 | 2.1 | virginica |
| 129 | 7.2 | 3.0 | 5.8 | 1.6 | virginica |
| 130 | 7.4 | 2.8 | 6.1 | 1.9 | virginica |
| 131 | 7.9 | 3.8 | 6.4 | 2.0 | virginica |
| 132 | 6.4 | 2.8 | 5.6 | 2.2 | virginica |
| 133 | 6.3 | 2.8 | 5.1 | 1.5 | virginica |
| 134 | 6.1 | 2.6 | 5.6 | 1.4 | virginica |
| 135 | 7.7 | 3.0 | 6.1 | 2.3 | virginica |
| 136 | 6.3 | 3.4 | 5.6 | 2.4 | virginica |
| 137 | 6.4 | 3.1 | 5.5 | 1.8 | virginica |
| 138 | 6.0 | 3.0 | 4.8 | 1.8 | virginica |
| 139 | 6.9 | 3.1 | 5.4 | 2.1 | virginica |
| 140 | 6.7 | 3.1 | 5.6 | 2.4 | virginica |
| 141 | 6.9 | 3.1 | 5.1 | 2.3 | virginica |
| 142 | 5.8 | 2.7 | 5.1 | 1.9 | virginica |
| 143 | 6.8 | 3.2 | 5.9 | 2.3 | virginica |
| 144 | 6.7 | 3.3 | 5.7 | 2.5 | virginica |
| 145 | 6.7 | 3.0 | 5.2 | 2.3 | virginica |
| 146 | 6.3 | 2.5 | 5.0 | 1.9 | virginica |
| 147 | 6.5 | 3.0 | 5.2 | 2.0 | virginica |
| 148 | 6.2 | 3.4 | 5.4 | 2.3 | virginica |
| 149 | 5.9 | 3.0 | 5.1 | 1.8 | virginica |
In [297]:
irisFrm.sort_values(by='petal_length', ascending = False).groupby('species').get_group('setosa')
Out[297]:
| sepal_length | sepal_width | petal_length | petal_width | species | |
|---|---|---|---|---|---|
| 24 | 4.8 | 3.4 | 1.9 | 0.2 | setosa |
| 44 | 5.1 | 3.8 | 1.9 | 0.4 | setosa |
| 18 | 5.7 | 3.8 | 1.7 | 0.3 | setosa |
| 5 | 5.4 | 3.9 | 1.7 | 0.4 | setosa |
| 23 | 5.1 | 3.3 | 1.7 | 0.5 | setosa |
| 20 | 5.4 | 3.4 | 1.7 | 0.2 | setosa |
| 30 | 4.8 | 3.1 | 1.6 | 0.2 | setosa |
| 46 | 5.1 | 3.8 | 1.6 | 0.2 | setosa |
| 11 | 4.8 | 3.4 | 1.6 | 0.2 | setosa |
| 25 | 5.0 | 3.0 | 1.6 | 0.2 | setosa |
| 26 | 5.0 | 3.4 | 1.6 | 0.4 | setosa |
| 29 | 4.7 | 3.2 | 1.6 | 0.2 | setosa |
| 43 | 5.0 | 3.5 | 1.6 | 0.6 | setosa |
| 7 | 5.0 | 3.4 | 1.5 | 0.2 | setosa |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
| 10 | 5.4 | 3.7 | 1.5 | 0.2 | setosa |
| 15 | 5.7 | 4.4 | 1.5 | 0.4 | setosa |
| 21 | 5.1 | 3.7 | 1.5 | 0.4 | setosa |
| 34 | 4.9 | 3.1 | 1.5 | 0.2 | setosa |
| 48 | 5.3 | 3.7 | 1.5 | 0.2 | setosa |
| 39 | 5.1 | 3.4 | 1.5 | 0.2 | setosa |
| 19 | 5.1 | 3.8 | 1.5 | 0.3 | setosa |
| 9 | 4.9 | 3.1 | 1.5 | 0.1 | setosa |
| 31 | 5.4 | 3.4 | 1.5 | 0.4 | setosa |
| 27 | 5.2 | 3.5 | 1.5 | 0.2 | setosa |
| 32 | 5.2 | 4.1 | 1.5 | 0.1 | setosa |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
| 28 | 5.2 | 3.4 | 1.4 | 0.2 | setosa |
| 8 | 4.4 | 2.9 | 1.4 | 0.2 | setosa |
| 6 | 4.6 | 3.4 | 1.4 | 0.3 | setosa |
| 12 | 4.8 | 3.0 | 1.4 | 0.1 | setosa |
| 17 | 5.1 | 3.5 | 1.4 | 0.3 | setosa |
| 47 | 4.6 | 3.2 | 1.4 | 0.2 | setosa |
| 33 | 5.5 | 4.2 | 1.4 | 0.2 | setosa |
| 49 | 5.0 | 3.3 | 1.4 | 0.2 | setosa |
| 45 | 4.8 | 3.0 | 1.4 | 0.3 | setosa |
| 37 | 4.9 | 3.6 | 1.4 | 0.1 | setosa |
| 36 | 5.5 | 3.5 | 1.3 | 0.2 | setosa |
| 16 | 5.4 | 3.9 | 1.3 | 0.4 | setosa |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
| 38 | 4.4 | 3.0 | 1.3 | 0.2 | setosa |
| 42 | 4.4 | 3.2 | 1.3 | 0.2 | setosa |
| 40 | 5.0 | 3.5 | 1.3 | 0.3 | setosa |
| 41 | 4.5 | 2.3 | 1.3 | 0.3 | setosa |
| 35 | 5.0 | 3.2 | 1.2 | 0.2 | setosa |
| 14 | 5.8 | 4.0 | 1.2 | 0.2 | setosa |
| 13 | 4.3 | 3.0 | 1.1 | 0.1 | setosa |
| 22 | 4.6 | 3.6 | 1.0 | 0.2 | setosa |
In [298]:
titanicFrm.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 891 entries, 0 to 890 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 survived 891 non-null int64 1 pclass 891 non-null int64 2 sex 891 non-null object 3 age 714 non-null float64 4 sibsp 891 non-null int64 5 parch 891 non-null int64 6 fare 891 non-null float64 7 embarked 889 non-null object 8 class 891 non-null category 9 who 891 non-null object 10 adult_male 891 non-null bool 11 deck 203 non-null category 12 embark_town 889 non-null object 13 alive 891 non-null object 14 alone 891 non-null bool dtypes: bool(2), category(2), float64(2), int64(4), object(5) memory usage: 80.7+ KB
In [301]:
print('age 결측값 - ')
print(titanicFrm['age'].isnull().sum())
age 결측값 - 177
In [302]:
titanicFrm.groupby('sex')['age'].mean()
Out[302]:
sex female 27.915709 male 30.726645 Name: age, dtype: float64
In [304]:
titanicFrm.groupby('sex')['age'].groups
Out[304]:
{'female': [1, 2, 3, 8, 9, 10, 11, 14, 15, 18, 19, 22, 24, 25, 28, 31, 32, 38, 39, 40, 41, 43, 44, 47, 49, 52, 53, 56, 58, 61, 66, 68, 71, 79, 82, 84, 85, 88, 98, 100, 106, 109, 111, 113, 114, 119, 123, 128, 132, 133, 136, 140, 141, 142, 147, 151, 156, 161, 166, 167, 172, 177, 180, 184, 186, 190, 192, 194, 195, 198, 199, 205, 208, 211, 215, 216, 218, 229, 230, 233, 235, 237, 240, 241, 246, 247, 251, 254, 255, 256, 257, 258, 259, 264, 268, 269, 272, 274, 275, 276, ...], 'male': [0, 4, 5, 6, 7, 12, 13, 16, 17, 20, 21, 23, 26, 27, 29, 30, 33, 34, 35, 36, 37, 42, 45, 46, 48, 50, 51, 54, 55, 57, 59, 60, 62, 63, 64, 65, 67, 69, 70, 72, 73, 74, 75, 76, 77, 78, 80, 81, 83, 86, 87, 89, 90, 91, 92, 93, 94, 95, 96, 97, 99, 101, 102, 103, 104, 105, 107, 108, 110, 112, 115, 116, 117, 118, 120, 121, 122, 124, 125, 126, 127, 129, 130, 131, 134, 135, 137, 138, 139, 143, 144, 145, 146, 148, 149, 150, 152, 153, 154, 155, ...]}
In [315]:
tmp = titanicFrm.groupby('sex')['age'].apply(lambda x : x.fillna(x.mean()))
In [316]:
titanicFrm['age'] = tmp.values
print('age 결측값 - ')
print(titanicFrm['age'].isnull().sum())
age 결측값 - 0
In [319]:
titanicFrm
Out[319]:
| survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | deck | embark_town | alive | alone | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | male | 38.0 | 1 | 0 | 7.2500 | S | Third | man | True | NaN | Southampton | no | False |
| 1 | 1 | 1 | female | 26.0 | 1 | 0 | 71.2833 | C | First | woman | False | C | Cherbourg | yes | False |
| 2 | 1 | 3 | female | 35.0 | 0 | 0 | 7.9250 | S | Third | woman | False | NaN | Southampton | yes | True |
| 3 | 1 | 1 | female | 27.0 | 1 | 0 | 53.1000 | S | First | woman | False | C | Southampton | yes | False |
| 4 | 0 | 3 | male | 14.0 | 0 | 0 | 8.0500 | S | Third | man | True | NaN | Southampton | no | True |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 886 | 0 | 2 | male | 28.0 | 0 | 0 | 13.0000 | S | Second | man | True | NaN | Southampton | no | True |
| 887 | 1 | 1 | female | 25.0 | 0 | 0 | 30.0000 | S | First | woman | False | B | Southampton | yes | True |
| 888 | 0 | 3 | female | 27.0 | 1 | 2 | 23.4500 | S | Third | woman | False | NaN | Southampton | no | False |
| 889 | 1 | 1 | male | 26.0 | 0 | 0 | 30.0000 | C | First | man | True | C | Cherbourg | yes | True |
| 890 | 0 | 3 | male | 32.0 | 0 | 0 | 7.7500 | Q | Third | man | True | NaN | Queenstown | no | True |
891 rows × 15 columns
In [ ]:
In [ ]:
In [ ]:
In [ ]: