학습목표¶

  • 데이터 조작(잘 꺼내야된다!!)
  • 프레임은 기본 열 인덱스를 사용하고, 행 인덱스를 위해서는 슬라이싱
  • loc[] 라벨값, iloc[] 정수값 : 인덱서
In [1]:
import numpy  as np 
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt 
import json   

# warning 제거
import warnings
warnings.filterwarnings('ignore')

# version check 
print('numpy  version - ' , np.__version__)
print('pandas version - ' , pd.__version__)

# 데이터 정보 출력 함수 
def aryInfo(ary) : 
    print('type - ' , type(ary)) 
    print('shape - ' , ary.shape)
    print('ndim  - ' , ary.ndim)
    print('dtype - ' , ary.dtype)
    print()
    print('data  -')
    print(ary)

def seriesInfo(s) :
    print('type   - ' , type(s)) 
    print('index  - ' , s.index)
    print('values - ' , s.values)
    print('dtype  - ' , s.dtype)
    print()
    print('data   - ')
    print(s)

def frmInfo(frm) :
    print('type    - ' , type(frm))
    print('shape   - ' , frm.shape)
    print('ndim    - ' , frm.ndim)
    print('row idx - ' , frm.index , type(frm.index))
    print('col idx - ' , frm.columns , type(frm.columns))
    print('values  - ' , type(frm.values))
    print(frm.values)
    print('data - ') 
    print(frm)
numpy  version -  2.1.3
pandas version -  2.2.3
In [51]:
# 프레임은 데이터가 있는 열의 집합
scores = {
    'kor' : [90,85,100,88,78],
    'eng' : [90,85,100,88,78],
    'mat' : [90,85,100,88,78]
} # 프레임으로 만들면 행이 5개 열이 3개 생성

frm = pd.DataFrame(scores,
                  index = ['강승우', '최호준', '임정섭', '이현우', '오신호'])

frmInfo(frm)
type    -  <class 'pandas.core.frame.DataFrame'>
shape   -  (5, 3)
ndim    -  2
row idx -  Index(['강승우', '최호준', '임정섭', '이현우', '오신호'], dtype='object') <class 'pandas.core.indexes.base.Index'>
col idx -  Index(['kor', 'eng', 'mat'], dtype='object') <class 'pandas.core.indexes.base.Index'>
values  -  <class 'numpy.ndarray'>
[[ 90  90  90]
 [ 85  85  85]
 [100 100 100]
 [ 88  88  88]
 [ 78  78  78]]
data - 
     kor  eng  mat
강승우   90   90   90
최호준   85   85   85
임정섭  100  100  100
이현우   88   88   88
오신호   78   78   78
In [52]:
frm
Out[52]:
kor eng mat
강승우 90 90 90
최호준 85 85 85
임정섭 100 100 100
이현우 88 88 88
오신호 78 78 78
In [53]:
# Quiz
# 모든 학생의 과목평균 점수를 새로운 열('mean') 추가하고 싶다면?
# axis = 0 (열), axis = 1 (행)
frm.values
Out[53]:
array([[ 90,  90,  90],
       [ 85,  85,  85],
       [100, 100, 100],
       [ 88,  88,  88],
       [ 78,  78,  78]])
In [69]:
frm['mean'] = np.mean(frm.values, axis=1).astype(np.int32)
frm
Out[69]:
kor eng mat mean
강승우 90 90 90 90
최호준 85 90 85 86
임정섭 100 100 100 100
이현우 88 88 88 88
오신호 78 78 78 78
In [70]:
# Quiz
# 최호준 학생의 영어점수를 90점으로 수정하고 평균 점수도 다시 계산
# iloc[], loc[] 행 인덱싱에서 사용하는 함수

frm.loc['최호준', 'eng'] = 90
frm['mean'] = np.mean(frm.values, axis=1).astype(np.int32)
frm
Out[70]:
kor eng mat mean
강승우 90 90 90 90
최호준 85 90 85 86
임정섭 100 100 100 100
이현우 88 88 88 88
오신호 78 78 78 78
In [78]:
# 시리즈
lee = frm.loc['이현우']
print(lee)
print('type - ', type(lee))
kor     88
eng     88
mat     88
mean    88
Name: 이현우, dtype: int64
type -  <class 'pandas.core.series.Series'>
In [79]:
# 데이터 프레임
lee = frm.loc[['이현우']]
print(lee)
print('type - ', type(lee))
     kor  eng  mat  mean
이현우   88   88   88    88
type -  <class 'pandas.core.frame.DataFrame'>
In [84]:
lim  = frm.loc['임정섭', 'kor' : 'eng']
print(lim)
print('type - ', type(lim))
kor    100
eng    100
Name: 임정섭, dtype: int64
type -  <class 'pandas.core.series.Series'>
  • titanic dataset(seaborn)
In [135]:
titanicRawData = sns.load_dataset('titanic')
print('type - ', type(titanicRawData))
titanicRawData.head()
type -  <class 'pandas.core.frame.DataFrame'>
Out[135]:
survived pclass sex age sibsp parch fare embarked class who adult_male deck embark_town alive alone
0 0 3 male 22.0 1 0 7.2500 S Third man True NaN Southampton no False
1 1 1 female 38.0 1 0 71.2833 C First woman False C Cherbourg yes False
2 1 3 female 26.0 0 0 7.9250 S Third woman False NaN Southampton yes True
3 1 1 female 35.0 1 0 53.1000 S First woman False C Southampton yes False
4 0 3 male 35.0 0 0 8.0500 S Third man True NaN Southampton no True
In [107]:
titanicRawData.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
 15  age_by_10    891 non-null    int64   
dtypes: bool(2), category(2), float64(2), int64(5), object(5)
memory usage: 87.6+ KB
In [96]:
# Quiz
# 선실등급(pclass)의 인원수를 확인하고 싶다면?

print('유일값 확인 - unique()')
print(titanicRawData['pclass'].unique())
print(titanicRawData['pclass'].value_counts().values)
유일값 확인 - unique()
[3 1 2]
[491 216 184]
In [99]:
# 데이터 프레임의 컬럼명 확인
print('type - ', type(titanicRawData.columns))
print(titanicRawData.columns)
type -  <class 'pandas.core.indexes.base.Index'>
Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')
In [108]:
# Quiz
# 기존나이에 10살을 더해서 age_by_10 열을 추가하고 싶다면?


titanicRawData['age_by_10'] = (titanicRawData['age'].values + 10).astype('int')
titanicRawData.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
 15  age_by_10    891 non-null    int64   
dtypes: bool(2), category(2), float64(2), int64(5), object(5)
memory usage: 87.6+ KB
In [109]:
# age_by_10 컬럼을 삭제하고 싶다면?
# drop( , axis = 1, inplace = True)
titanicRawData.drop(['age_by_10'], axis = 1 , inplace = True)
In [131]:
titanicRawData.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB
In [132]:
# 요금(fare)에 대한 통계(최대, 최소, 평균, 합계) 확인이 필요하다면?
print('fare max - ', np.max(titanicRawData['fare'].values))
print('fare min - ', np.min(titanicRawData['fare'].values))
print('fare mean - ', np.mean(titanicRawData['fare'].values))
fare max -  512.3292
fare min -  0.0
fare mean -  32.204207968574636
In [136]:
# Quiz
# 선실등급(pclass)이 3등급인 데이터만의 subset 만들고 싶다면?
pclassSubsetFrm = titanicRawData[titanicRawData['pclass'] == 3]
print('type - ', type(pclassSubsetFrm))
print(pclassSubsetFrm)
type -  <class 'pandas.core.frame.DataFrame'>
     survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0           0       3    male  22.0      1      0   7.2500        S  Third   
2           1       3  female  26.0      0      0   7.9250        S  Third   
4           0       3    male  35.0      0      0   8.0500        S  Third   
5           0       3    male   NaN      0      0   8.4583        Q  Third   
7           0       3    male   2.0      3      1  21.0750        S  Third   
..        ...     ...     ...   ...    ...    ...      ...      ...    ...   
882         0       3  female  22.0      0      0  10.5167        S  Third   
884         0       3    male  25.0      0      0   7.0500        S  Third   
885         0       3  female  39.0      0      5  29.1250        Q  Third   
888         0       3  female   NaN      1      2  23.4500        S  Third   
890         0       3    male  32.0      0      0   7.7500        Q  Third   

       who  adult_male deck  embark_town alive  alone  
0      man        True  NaN  Southampton    no  False  
2    woman       False  NaN  Southampton   yes   True  
4      man        True  NaN  Southampton    no   True  
5      man        True  NaN   Queenstown    no   True  
7    child       False  NaN  Southampton    no  False  
..     ...         ...  ...          ...   ...    ...  
882  woman       False  NaN  Southampton    no   True  
884    man        True  NaN  Southampton    no   True  
885  woman       False  NaN   Queenstown    no  False  
888  woman       False  NaN  Southampton    no  False  
890    man        True  NaN   Queenstown    no   True  

[491 rows x 15 columns]
In [144]:
# Quiz
# 위 서브셋에서 성별과(sex), 생존여부(survived)만 가지는 새로운 서브셋을 만들고 싶다면?
newsubset = pclassSubsetFrm[['sex','survived']]
print(newsubset.head())
      sex  survived
0    male         0
1  female         1
2    male         0
3    male         0
4    male         0
In [151]:
# 인덱스 재조정
# reset_index()
pclassSubsetFrm.reset_index(inplace = True)
print(pclassSubsetFrm.head())
   index  survived  pclass     sex   age  sibsp  parch     fare embarked  \
0      0         0       3    male  22.0      1      0   7.2500        S   
1      1         1       3  female  26.0      0      0   7.9250        S   
2      2         0       3    male  35.0      0      0   8.0500        S   
3      3         0       3    male   NaN      0      0   8.4583        Q   
4      4         0       3    male   2.0      3      1  21.0750        S   

   class    who  adult_male deck  embark_town alive  alone  
0  Third    man        True  NaN  Southampton    no  False  
1  Third  woman       False  NaN  Southampton   yes   True  
2  Third    man        True  NaN  Southampton    no   True  
3  Third    man        True  NaN   Queenstown    no   True  
4  Third  child       False  NaN  Southampton    no  False  
In [152]:
print(pclassSubsetFrm)
     index  survived  pclass     sex   age  sibsp  parch     fare embarked  \
0        0         0       3    male  22.0      1      0   7.2500        S   
1        1         1       3  female  26.0      0      0   7.9250        S   
2        2         0       3    male  35.0      0      0   8.0500        S   
3        3         0       3    male   NaN      0      0   8.4583        Q   
4        4         0       3    male   2.0      3      1  21.0750        S   
..     ...       ...     ...     ...   ...    ...    ...      ...      ...   
486    486         0       3  female  22.0      0      0  10.5167        S   
487    487         0       3    male  25.0      0      0   7.0500        S   
488    488         0       3  female  39.0      0      5  29.1250        Q   
489    489         0       3  female   NaN      1      2  23.4500        S   
490    490         0       3    male  32.0      0      0   7.7500        Q   

     class    who  adult_male deck  embark_town alive  alone  
0    Third    man        True  NaN  Southampton    no  False  
1    Third  woman       False  NaN  Southampton   yes   True  
2    Third    man        True  NaN  Southampton    no   True  
3    Third    man        True  NaN   Queenstown    no   True  
4    Third  child       False  NaN  Southampton    no  False  
..     ...    ...         ...  ...          ...   ...    ...  
486  Third  woman       False  NaN  Southampton    no   True  
487  Third    man        True  NaN  Southampton    no   True  
488  Third  woman       False  NaN   Queenstown    no  False  
489  Third  woman       False  NaN  Southampton    no  False  
490  Third    man        True  NaN   Queenstown    no   True  

[491 rows x 16 columns]
In [153]:
pclassSubsetFrm.drop('index', axis = 1 , inplace =True)
pclassSubsetFrm
Out[153]:
survived pclass sex age sibsp parch fare embarked class who adult_male deck embark_town alive alone
0 0 3 male 22.0 1 0 7.2500 S Third man True NaN Southampton no False
1 1 3 female 26.0 0 0 7.9250 S Third woman False NaN Southampton yes True
2 0 3 male 35.0 0 0 8.0500 S Third man True NaN Southampton no True
3 0 3 male NaN 0 0 8.4583 Q Third man True NaN Queenstown no True
4 0 3 male 2.0 3 1 21.0750 S Third child False NaN Southampton no False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
486 0 3 female 22.0 0 0 10.5167 S Third woman False NaN Southampton no True
487 0 3 male 25.0 0 0 7.0500 S Third man True NaN Southampton no True
488 0 3 female 39.0 0 5 29.1250 Q Third woman False NaN Queenstown no False
489 0 3 female NaN 1 2 23.4500 S Third woman False NaN Southampton no False
490 0 3 male 32.0 0 0 7.7500 Q Third man True NaN Queenstown no True

491 rows × 15 columns

In [154]:
# set_index : 특정 컬럼을 인덱스로 변경하는 함수

pclassSubsetFrm.reset_index(inplace=True)
pclassSubsetFrm
Out[154]:
index survived pclass sex age sibsp parch fare embarked class who adult_male deck embark_town alive alone
0 0 0 3 male 22.0 1 0 7.2500 S Third man True NaN Southampton no False
1 1 1 3 female 26.0 0 0 7.9250 S Third woman False NaN Southampton yes True
2 2 0 3 male 35.0 0 0 8.0500 S Third man True NaN Southampton no True
3 3 0 3 male NaN 0 0 8.4583 Q Third man True NaN Queenstown no True
4 4 0 3 male 2.0 3 1 21.0750 S Third child False NaN Southampton no False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
486 486 0 3 female 22.0 0 0 10.5167 S Third woman False NaN Southampton no True
487 487 0 3 male 25.0 0 0 7.0500 S Third man True NaN Southampton no True
488 488 0 3 female 39.0 0 5 29.1250 Q Third woman False NaN Queenstown no False
489 489 0 3 female NaN 1 2 23.4500 S Third woman False NaN Southampton no False
490 490 0 3 male 32.0 0 0 7.7500 Q Third man True NaN Queenstown no True

491 rows × 16 columns

In [155]:
pclassSubsetFrm.set_index('index', inplace=True)
pclassSubsetFrm
Out[155]:
survived pclass sex age sibsp parch fare embarked class who adult_male deck embark_town alive alone
index
0 0 3 male 22.0 1 0 7.2500 S Third man True NaN Southampton no False
1 1 3 female 26.0 0 0 7.9250 S Third woman False NaN Southampton yes True
2 0 3 male 35.0 0 0 8.0500 S Third man True NaN Southampton no True
3 0 3 male NaN 0 0 8.4583 Q Third man True NaN Queenstown no True
4 0 3 male 2.0 3 1 21.0750 S Third child False NaN Southampton no False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
486 0 3 female 22.0 0 0 10.5167 S Third woman False NaN Southampton no True
487 0 3 male 25.0 0 0 7.0500 S Third man True NaN Southampton no True
488 0 3 female 39.0 0 5 29.1250 Q Third woman False NaN Queenstown no False
489 0 3 female NaN 1 2 23.4500 S Third woman False NaN Southampton no False
490 0 3 male 32.0 0 0 7.7500 Q Third man True NaN Queenstown no True

491 rows × 15 columns

In [161]:
# Quiz
# 원본 데이터로부터 나이가 60이상이고 선실등급이 1등급이면서 성별이 여자인 데이터만 추출하여 서브셋을 만들고 싶다면?

sub = titanicRawData[(titanicRawData['age'] >= 60) & (titanicRawData['pclass'] == 1) & (titanicRawData['sex'] == 'female')]
print(sub)
     survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
275         1       1  female  63.0      1      0  77.9583        S  First   
366         1       1  female  60.0      1      0  75.2500        C  First   
829         1       1  female  62.0      0      0  80.0000      NaN  First   

       who  adult_male deck  embark_town alive  alone  
275  woman       False    D  Southampton   yes  False  
366  woman       False    D    Cherbourg   yes  False  
829  woman       False    B          NaN   yes   True  
In [162]:
sub = titanicRawData[(titanicRawData['age'] >= 60) & (titanicRawData['pclass'] == 1) & (titanicRawData['sex'] == 'male')]
print(sub)
     survived  pclass   sex   age  sibsp  parch      fare embarked  class  \
54          0       1  male  65.0      0      1   61.9792        C  First   
96          0       1  male  71.0      0      0   34.6542        C  First   
170         0       1  male  61.0      0      0   33.5000        S  First   
252         0       1  male  62.0      0      0   26.5500        S  First   
438         0       1  male  64.0      1      4  263.0000        S  First   
456         0       1  male  65.0      0      0   26.5500        S  First   
493         0       1  male  71.0      0      0   49.5042        C  First   
545         0       1  male  64.0      0      0   26.0000        S  First   
555         0       1  male  62.0      0      0   26.5500        S  First   
587         1       1  male  60.0      1      1   79.2000        C  First   
625         0       1  male  61.0      0      0   32.3208        S  First   
630         1       1  male  80.0      0      0   30.0000        S  First   
694         0       1  male  60.0      0      0   26.5500        S  First   
745         0       1  male  70.0      1      1   71.0000        S  First   

     who  adult_male deck  embark_town alive  alone  
54   man        True    B    Cherbourg    no  False  
96   man        True    A    Cherbourg    no   True  
170  man        True    B  Southampton    no   True  
252  man        True    C  Southampton    no   True  
438  man        True    C  Southampton    no  False  
456  man        True    E  Southampton    no   True  
493  man        True  NaN    Cherbourg    no   True  
545  man        True  NaN  Southampton    no   True  
555  man        True  NaN  Southampton    no   True  
587  man        True    B    Cherbourg   yes  False  
625  man        True    D  Southampton    no   True  
630  man        True    A  Southampton   yes   True  
694  man        True  NaN  Southampton    no   True  
745  man        True    B  Southampton    no  False  

sort : 인덱스 기준, 열 값을 기준으로 # ascending = True(오름차순) False(내림차순)¶

  • sort_index(axis = , ascending =)
  • sort_values(by = , ascending =)
In [164]:
# 원본데이터로부터 승객의 나이를 기준으로 내림차순한 서브셋을 만들고 싶다면?
subsetFrm = titanicRawData.sort_values(by='age', ascending=False)
subsetFrm.reset_index(inplace = True)
subsetFrm.drop('index', axis = 1, inplace = True)
subsetFrm
Out[164]:
survived pclass sex age sibsp parch fare embarked class who adult_male deck embark_town alive alone
0 1 1 male 80.0 0 0 30.0000 S First man True A Southampton yes True
1 0 3 male 74.0 0 0 7.7750 S Third man True NaN Southampton no True
2 0 1 male 71.0 0 0 49.5042 C First man True NaN Cherbourg no True
3 0 1 male 71.0 0 0 34.6542 C First man True A Cherbourg no True
4 0 3 male 70.5 0 0 7.7500 Q Third man True NaN Queenstown no True
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
886 0 3 male NaN 0 0 7.2292 C Third man True NaN Cherbourg no True
887 0 3 female NaN 8 2 69.5500 S Third woman False NaN Southampton no False
888 0 3 male NaN 0 0 9.5000 S Third man True NaN Southampton no True
889 0 3 male NaN 0 0 7.8958 S Third man True NaN Southampton no True
890 0 3 female NaN 1 2 23.4500 S Third woman False NaN Southampton no False

891 rows × 15 columns

In [168]:
print('성별에 따른 승객수를 시각화하기 위해서 정렬을 한다면 - ')
titanicRawData['sex'].value_counts().sort_values(ascending = False)
성별에 따른 승객수를 시각화하기 위해서 정렬을 한다면 - 
Out[168]:
sex
male      577
female    314
Name: count, dtype: int64
In [169]:
titanicRawData.sort_index(ascending= False)
Out[169]:
survived pclass sex age sibsp parch fare embarked class who adult_male deck embark_town alive alone
890 0 3 male 32.0 0 0 7.7500 Q Third man True NaN Queenstown no True
889 1 1 male 26.0 0 0 30.0000 C First man True C Cherbourg yes True
888 0 3 female NaN 1 2 23.4500 S Third woman False NaN Southampton no False
887 1 1 female 19.0 0 0 30.0000 S First woman False B Southampton yes True
886 0 2 male 27.0 0 0 13.0000 S Second man True NaN Southampton no True
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
4 0 3 male 35.0 0 0 8.0500 S Third man True NaN Southampton no True
3 1 1 female 35.0 1 0 53.1000 S First woman False C Southampton yes False
2 1 3 female 26.0 0 0 7.9250 S Third woman False NaN Southampton yes True
1 1 1 female 38.0 1 0 71.2833 C First woman False C Cherbourg yes False
0 0 3 male 22.0 1 0 7.2500 S Third man True NaN Southampton no False

891 rows × 15 columns

  • Pandas
  • csv 파일 입출력과 데이터 조작 (excel, json, html, scraping 비정형 데이터)
  • read_xxxxx() , to_xxxxx()
In [170]:
namesFrm = pd.read_csv('./data/year2022_baby_name.csv',
                      sep=',',
                      encoding='utf-8')
print('type - ', type(namesFrm))
type -  <class 'pandas.core.frame.DataFrame'>
In [172]:
namesFrm.head()
Out[172]:
NAME GENDER COUNT
0 Isabella F 22731
1 Sophia F 20477
2 Emma F 17179
3 Olivia F 16860
4 Ava F 15300
In [173]:
namesFrm.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33838 entries, 0 to 33837
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   NAME    33838 non-null  object
 1   GENDER  33838 non-null  object
 2   COUNT   33838 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 793.2+ KB
In [175]:
# namesFrm.describe() # 숫자데이터 통계정보
namesFrm.columns
Out[175]:
Index(['NAME', 'GENDER', 'COUNT'], dtype='object')
In [177]:
# Quiz
# count 열을 기준으로 내림차순 정렬하여 서브셋을 만든다면?
# 인덱스를 확인하고 reset시킨후
# 불필요한 'index' 열 삭제

subset = namesFrm.sort_values(by='COUNT', ascending=False)
subset.reset_index(inplace = True)
subset.drop('index', axis = 1, inplace = True)
subset
Out[177]:
NAME GENDER COUNT
0 Isabella F 22731
1 Jacob M 21875
2 Sophia F 20477
3 Ethan M 17866
4 Emma F 17179
... ... ... ...
33833 Xaine M 5
33834 Xaveon M 5
33835 Xavious M 5
33836 Xiomar M 5
33837 Xylan M 5

33838 rows × 3 columns

In [191]:
# Quiz
# 열 이름을 변경하고자 하고( NAME -> name, GENDER -> gender , COUNT -> count)
# 성별(gender)이 남자인 데이터를 추출한다면?

# type(subset.columns.values)

# for col in subset.columns.values:
#     print(col.lower())

cols = [ col.lower() for col in subset.columns.values]
subset.columns = cols
print(subset)

# subset.rename(columns = cols, inplace = True) # 리스트 형태라서 안됨 딕셔너리로 만들어줘야함
# subset[subset['gender']=='M']

subset.rename(columns = {col: col.lower() for col in subset.columns}, inplace = True) # 딕셔너리 컴프리헨션
subset[subset['gender']=='M']

# subset.rename(columns={'NAME': 'name', 'GENDER': 'gender', 'COUNT': 'count'}, inplace=True)
# M = subset[subset['gender'] == 'M']
# print(M)
           name gender  count
0      Isabella      F  22731
1         Jacob      M  21875
2        Sophia      F  20477
3         Ethan      M  17866
4          Emma      F  17179
...         ...    ...    ...
33833     Xaine      M      5
33834    Xaveon      M      5
33835   Xavious      M      5
33836    Xiomar      M      5
33837     Xylan      M      5

[33838 rows x 3 columns]
Out[191]:
name gender count
1 Jacob M 21875
3 Ethan M 17866
5 Michael M 17133
6 Jayden M 17030
7 William M 16870
... ... ... ...
33833 Xaine M 5
33834 Xaveon M 5
33835 Xavious M 5
33836 Xiomar M 5
33837 Xylan M 5

14140 rows × 3 columns

통계량 확인¶

  • groupby(열 인덱스 | 인덱스) : 데이터를 그룹으로 분할하여 독립된 그룹에 대해서 별도의 데이터 처리
  • split, apply(lambda), combine, agg([count, sum, mean])
In [198]:
frm = pd.read_csv('./data/service_data_groupby_sample.csv',
                      encoding='cp949')
print('type - ', type(frm))
frm
type -  <class 'pandas.core.frame.DataFrame'>
Out[198]:
id gender height age region
0 1 남자 175 22 서울
1 2 여자 160 23 서울
2 3 여자 161 21 서울
3 4 여자 170 33 서울
4 5 여자 155 35 경기
5 6 남자 181 41 서울
6 7 남자 183 33 충북
7 8 여자 171 22 서울
8 9 남자 188 29 경기
9 10 남자 177 39 인천
10 11 여자 152 40 서울
11 12 남자 180 24 서울
In [194]:
frm.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      12 non-null     int64 
 1   gender  12 non-null     object
 2   height  12 non-null     int64 
 3   age     12 non-null     int64 
 4   region  12 non-null     object
dtypes: int64(3), object(2)
memory usage: 612.0+ bytes
In [203]:
# Quiz
# 지역별 나이평균을 확인하고 싶다면??

# type(frm.groupby('region').get_group('경기'))
# type(frm.groupby('region').get_group('경기')['age'])

frm.groupby('region')['age'].mean()
Out[203]:
region
경기    32.00
서울    28.25
인천    39.00
충북    33.00
Name: age, dtype: float64
In [209]:
# Quiz
# 성별을 기준으로 그룹을 나누고 싶다면 -
tmp = frm.groupby('gender')[['height']].mean()
tmp.reset_index(inplace = True)
tmp
Out[209]:
gender height
0 남자 180.666667
1 여자 161.500000
In [216]:
# 다중통계량 : agg()
subset = frm.drop('region', axis = 1)
subset
Out[216]:
id gender height age
0 1 남자 175 22
1 2 여자 160 23
2 3 여자 161 21
3 4 여자 170 33
4 5 여자 155 35
5 6 남자 181 41
6 7 남자 183 33
7 8 여자 171 22
8 9 남자 188 29
9 10 남자 177 39
10 11 여자 152 40
11 12 남자 180 24
In [215]:
subset.groupby('gender').agg(['mean', 'var', 'std'])
Out[215]:
id height age
mean var std mean var std mean var std
gender
남자 7.5 14.7 3.834058 180.666667 21.066667 4.589844 31.333333 60.266667 7.763161
여자 5.5 11.5 3.391165 161.500000 59.500000 7.713624 29.000000 64.400000 8.024961
In [218]:
frm.groupby('gender')['age'].agg(['max', 'min', 'mean', 'median']).reset_index()
Out[218]:
gender max min mean median
0 남자 41 22 31.333333 31.0
1 여자 40 21 29.000000 28.0
In [223]:
# Quiz
# 성별에 따른 거주지의 최빈값(mode())을 확인하고 싶다면?

# frm.groupby('gender')['region'].agg(lambda x : x.mode())
frm.groupby('gender')['region'].apply(lambda x : x.mode())
Out[223]:
gender   
남자      0    서울
여자      0    서울
Name: region, dtype: object
In [224]:
tipsFrm = sns.load_dataset('tips')
tipsFrm.head()
Out[224]:
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4
In [225]:
tipsFrm.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB
In [226]:
tipsFrm.columns
Out[226]:
Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')
In [228]:
titanicFrm = sns.load_dataset('titanic')
titanicFrm.head()
Out[228]:
survived pclass sex age sibsp parch fare embarked class who adult_male deck embark_town alive alone
0 0 3 male 22.0 1 0 7.2500 S Third man True NaN Southampton no False
1 1 1 female 38.0 1 0 71.2833 C First woman False C Cherbourg yes False
2 1 3 female 26.0 0 0 7.9250 S Third woman False NaN Southampton yes True
3 1 1 female 35.0 1 0 53.1000 S First woman False C Southampton yes False
4 0 3 male 35.0 0 0 8.0500 S Third man True NaN Southampton no True
In [234]:
# Quiz
# tipsFrm 전체 평균 팁은 얼마일까요?

tip = tipsFrm['tip'].mean()
print(tip)
2.99827868852459
In [240]:
# Quiz
# 남성과 여성 중 평균 팁이 더 높은 성별을 확인하고 싶다면?

print(tipsFrm.groupby('sex')['tip'].agg('mean').reset_index())

# 흡연자와 비흡연자 중 편균 팁 비율이 높은 그룹은?
# assign
print(tipsFrm.assign(pct = tipsFrm['tip'] / tipsFrm['total_bill']).groupby('smoker')['pct'].mean())
      sex       tip
0    Male  3.089618
1  Female  2.833448
smoker
Yes    0.163196
No     0.159328
Name: pct, dtype: float64
In [250]:
# Quiz
# 팁이 가장 많이 발생하는 요일을 확인하고 싶다면?

result = tipsFrm.groupby('day')['tip'].sum().sort_values(ascending=False).head(1)
print('type - ', type(result))
print(result)
type -  <class 'pandas.core.series.Series'>
day
Sat    260.4
Name: tip, dtype: float64
In [258]:
# Quiz
# ['Dinner', 'Lunch'] 중 평균 팁 비율이 높은 시간대를 확인하고 싶다면?

# tipsFrm['time'].unique()
# tipsFrm.groupby('time')['tip'].mean()

tipsFrm.assign(pct = tipsFrm['tip'] / tipsFrm['total_bill']).groupby('time')['pct'].mean().sort_values(ascending=False).head(1)

tipsFrm['pct'] = tipsFrm['tip'] / tipsFrm['total_bill']
result = tipsFrm.groupby('time')['pct'].mean().sort_values(ascending = False).head(1)
print(result)
time
Lunch    0.164128
Name: pct, dtype: float64
In [268]:
# Quiz : titanicFrm
# subset - (age,sex,class,fare,survived)

# subset = titanicFrm[['age', 'sex', 'class', 'fare', 'survived']]
# subset.head()

subset = titanicFrm.loc[: ,['age', 'sex', 'class', 'fare', 'survived']]
subset.head()
Out[268]:
age sex class fare survived
0 22.0 male Third 7.2500 0
1 38.0 female First 71.2833 1
2 26.0 female Third 7.9250 1
3 35.0 female First 53.1000 1
4 35.0 male Third 8.0500 0
In [285]:
# Quiz 
# 선실등급에 따른 그룹을 만들고 1등급 승객만 데이터 프레임 형식으로 만들어 본다면?

# firstClass = subset[subset['class'] == 'First']
# print(firstClass) 

grp = subset.groupby('class')
# print(grp)
# print(grp.get_group('First'))

result = grp.get_group('First')
print(result)

#subset.loc[grp.groups['First'].values , : ]
      age     sex  class     fare  survived
1    38.0  female  First  71.2833         1
3    35.0  female  First  53.1000         1
6    54.0    male  First  51.8625         0
11   58.0  female  First  26.5500         1
23   28.0    male  First  35.5000         1
..    ...     ...    ...      ...       ...
871  47.0  female  First  52.5542         1
872  33.0    male  First   5.0000         0
879  56.0  female  First  83.1583         1
887  19.0  female  First  30.0000         1
889  26.0    male  First  30.0000         1

[216 rows x 5 columns]
In [286]:
irisFrm = sns.load_dataset('iris')
irisFrm.head()
Out[286]:
sepal_length sepal_width petal_length petal_width species
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa
In [287]:
irisFrm['species'].value_counts()
Out[287]:
species
setosa        50
versicolor    50
virginica     50
Name: count, dtype: int64
In [288]:
grp = irisFrm.groupby('species')
print(grp)
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000014B9D9DF050>
In [289]:
print(grp.groups)
{'setosa': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49], 'versicolor': [50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99], 'virginica': [100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149]}
In [291]:
for key, group in grp :
    print('key - ', key)
    print()
    display(group)
key -  setosa

sepal_length sepal_width petal_length petal_width species
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa
5 5.4 3.9 1.7 0.4 setosa
6 4.6 3.4 1.4 0.3 setosa
7 5.0 3.4 1.5 0.2 setosa
8 4.4 2.9 1.4 0.2 setosa
9 4.9 3.1 1.5 0.1 setosa
10 5.4 3.7 1.5 0.2 setosa
11 4.8 3.4 1.6 0.2 setosa
12 4.8 3.0 1.4 0.1 setosa
13 4.3 3.0 1.1 0.1 setosa
14 5.8 4.0 1.2 0.2 setosa
15 5.7 4.4 1.5 0.4 setosa
16 5.4 3.9 1.3 0.4 setosa
17 5.1 3.5 1.4 0.3 setosa
18 5.7 3.8 1.7 0.3 setosa
19 5.1 3.8 1.5 0.3 setosa
20 5.4 3.4 1.7 0.2 setosa
21 5.1 3.7 1.5 0.4 setosa
22 4.6 3.6 1.0 0.2 setosa
23 5.1 3.3 1.7 0.5 setosa
24 4.8 3.4 1.9 0.2 setosa
25 5.0 3.0 1.6 0.2 setosa
26 5.0 3.4 1.6 0.4 setosa
27 5.2 3.5 1.5 0.2 setosa
28 5.2 3.4 1.4 0.2 setosa
29 4.7 3.2 1.6 0.2 setosa
30 4.8 3.1 1.6 0.2 setosa
31 5.4 3.4 1.5 0.4 setosa
32 5.2 4.1 1.5 0.1 setosa
33 5.5 4.2 1.4 0.2 setosa
34 4.9 3.1 1.5 0.2 setosa
35 5.0 3.2 1.2 0.2 setosa
36 5.5 3.5 1.3 0.2 setosa
37 4.9 3.6 1.4 0.1 setosa
38 4.4 3.0 1.3 0.2 setosa
39 5.1 3.4 1.5 0.2 setosa
40 5.0 3.5 1.3 0.3 setosa
41 4.5 2.3 1.3 0.3 setosa
42 4.4 3.2 1.3 0.2 setosa
43 5.0 3.5 1.6 0.6 setosa
44 5.1 3.8 1.9 0.4 setosa
45 4.8 3.0 1.4 0.3 setosa
46 5.1 3.8 1.6 0.2 setosa
47 4.6 3.2 1.4 0.2 setosa
48 5.3 3.7 1.5 0.2 setosa
49 5.0 3.3 1.4 0.2 setosa
key -  versicolor

sepal_length sepal_width petal_length petal_width species
50 7.0 3.2 4.7 1.4 versicolor
51 6.4 3.2 4.5 1.5 versicolor
52 6.9 3.1 4.9 1.5 versicolor
53 5.5 2.3 4.0 1.3 versicolor
54 6.5 2.8 4.6 1.5 versicolor
55 5.7 2.8 4.5 1.3 versicolor
56 6.3 3.3 4.7 1.6 versicolor
57 4.9 2.4 3.3 1.0 versicolor
58 6.6 2.9 4.6 1.3 versicolor
59 5.2 2.7 3.9 1.4 versicolor
60 5.0 2.0 3.5 1.0 versicolor
61 5.9 3.0 4.2 1.5 versicolor
62 6.0 2.2 4.0 1.0 versicolor
63 6.1 2.9 4.7 1.4 versicolor
64 5.6 2.9 3.6 1.3 versicolor
65 6.7 3.1 4.4 1.4 versicolor
66 5.6 3.0 4.5 1.5 versicolor
67 5.8 2.7 4.1 1.0 versicolor
68 6.2 2.2 4.5 1.5 versicolor
69 5.6 2.5 3.9 1.1 versicolor
70 5.9 3.2 4.8 1.8 versicolor
71 6.1 2.8 4.0 1.3 versicolor
72 6.3 2.5 4.9 1.5 versicolor
73 6.1 2.8 4.7 1.2 versicolor
74 6.4 2.9 4.3 1.3 versicolor
75 6.6 3.0 4.4 1.4 versicolor
76 6.8 2.8 4.8 1.4 versicolor
77 6.7 3.0 5.0 1.7 versicolor
78 6.0 2.9 4.5 1.5 versicolor
79 5.7 2.6 3.5 1.0 versicolor
80 5.5 2.4 3.8 1.1 versicolor
81 5.5 2.4 3.7 1.0 versicolor
82 5.8 2.7 3.9 1.2 versicolor
83 6.0 2.7 5.1 1.6 versicolor
84 5.4 3.0 4.5 1.5 versicolor
85 6.0 3.4 4.5 1.6 versicolor
86 6.7 3.1 4.7 1.5 versicolor
87 6.3 2.3 4.4 1.3 versicolor
88 5.6 3.0 4.1 1.3 versicolor
89 5.5 2.5 4.0 1.3 versicolor
90 5.5 2.6 4.4 1.2 versicolor
91 6.1 3.0 4.6 1.4 versicolor
92 5.8 2.6 4.0 1.2 versicolor
93 5.0 2.3 3.3 1.0 versicolor
94 5.6 2.7 4.2 1.3 versicolor
95 5.7 3.0 4.2 1.2 versicolor
96 5.7 2.9 4.2 1.3 versicolor
97 6.2 2.9 4.3 1.3 versicolor
98 5.1 2.5 3.0 1.1 versicolor
99 5.7 2.8 4.1 1.3 versicolor
key -  virginica

sepal_length sepal_width petal_length petal_width species
100 6.3 3.3 6.0 2.5 virginica
101 5.8 2.7 5.1 1.9 virginica
102 7.1 3.0 5.9 2.1 virginica
103 6.3 2.9 5.6 1.8 virginica
104 6.5 3.0 5.8 2.2 virginica
105 7.6 3.0 6.6 2.1 virginica
106 4.9 2.5 4.5 1.7 virginica
107 7.3 2.9 6.3 1.8 virginica
108 6.7 2.5 5.8 1.8 virginica
109 7.2 3.6 6.1 2.5 virginica
110 6.5 3.2 5.1 2.0 virginica
111 6.4 2.7 5.3 1.9 virginica
112 6.8 3.0 5.5 2.1 virginica
113 5.7 2.5 5.0 2.0 virginica
114 5.8 2.8 5.1 2.4 virginica
115 6.4 3.2 5.3 2.3 virginica
116 6.5 3.0 5.5 1.8 virginica
117 7.7 3.8 6.7 2.2 virginica
118 7.7 2.6 6.9 2.3 virginica
119 6.0 2.2 5.0 1.5 virginica
120 6.9 3.2 5.7 2.3 virginica
121 5.6 2.8 4.9 2.0 virginica
122 7.7 2.8 6.7 2.0 virginica
123 6.3 2.7 4.9 1.8 virginica
124 6.7 3.3 5.7 2.1 virginica
125 7.2 3.2 6.0 1.8 virginica
126 6.2 2.8 4.8 1.8 virginica
127 6.1 3.0 4.9 1.8 virginica
128 6.4 2.8 5.6 2.1 virginica
129 7.2 3.0 5.8 1.6 virginica
130 7.4 2.8 6.1 1.9 virginica
131 7.9 3.8 6.4 2.0 virginica
132 6.4 2.8 5.6 2.2 virginica
133 6.3 2.8 5.1 1.5 virginica
134 6.1 2.6 5.6 1.4 virginica
135 7.7 3.0 6.1 2.3 virginica
136 6.3 3.4 5.6 2.4 virginica
137 6.4 3.1 5.5 1.8 virginica
138 6.0 3.0 4.8 1.8 virginica
139 6.9 3.1 5.4 2.1 virginica
140 6.7 3.1 5.6 2.4 virginica
141 6.9 3.1 5.1 2.3 virginica
142 5.8 2.7 5.1 1.9 virginica
143 6.8 3.2 5.9 2.3 virginica
144 6.7 3.3 5.7 2.5 virginica
145 6.7 3.0 5.2 2.3 virginica
146 6.3 2.5 5.0 1.9 virginica
147 6.5 3.0 5.2 2.0 virginica
148 6.2 3.4 5.4 2.3 virginica
149 5.9 3.0 5.1 1.8 virginica
In [297]:
irisFrm.sort_values(by='petal_length', ascending = False).groupby('species').get_group('setosa')
Out[297]:
sepal_length sepal_width petal_length petal_width species
24 4.8 3.4 1.9 0.2 setosa
44 5.1 3.8 1.9 0.4 setosa
18 5.7 3.8 1.7 0.3 setosa
5 5.4 3.9 1.7 0.4 setosa
23 5.1 3.3 1.7 0.5 setosa
20 5.4 3.4 1.7 0.2 setosa
30 4.8 3.1 1.6 0.2 setosa
46 5.1 3.8 1.6 0.2 setosa
11 4.8 3.4 1.6 0.2 setosa
25 5.0 3.0 1.6 0.2 setosa
26 5.0 3.4 1.6 0.4 setosa
29 4.7 3.2 1.6 0.2 setosa
43 5.0 3.5 1.6 0.6 setosa
7 5.0 3.4 1.5 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
10 5.4 3.7 1.5 0.2 setosa
15 5.7 4.4 1.5 0.4 setosa
21 5.1 3.7 1.5 0.4 setosa
34 4.9 3.1 1.5 0.2 setosa
48 5.3 3.7 1.5 0.2 setosa
39 5.1 3.4 1.5 0.2 setosa
19 5.1 3.8 1.5 0.3 setosa
9 4.9 3.1 1.5 0.1 setosa
31 5.4 3.4 1.5 0.4 setosa
27 5.2 3.5 1.5 0.2 setosa
32 5.2 4.1 1.5 0.1 setosa
1 4.9 3.0 1.4 0.2 setosa
0 5.1 3.5 1.4 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa
28 5.2 3.4 1.4 0.2 setosa
8 4.4 2.9 1.4 0.2 setosa
6 4.6 3.4 1.4 0.3 setosa
12 4.8 3.0 1.4 0.1 setosa
17 5.1 3.5 1.4 0.3 setosa
47 4.6 3.2 1.4 0.2 setosa
33 5.5 4.2 1.4 0.2 setosa
49 5.0 3.3 1.4 0.2 setosa
45 4.8 3.0 1.4 0.3 setosa
37 4.9 3.6 1.4 0.1 setosa
36 5.5 3.5 1.3 0.2 setosa
16 5.4 3.9 1.3 0.4 setosa
2 4.7 3.2 1.3 0.2 setosa
38 4.4 3.0 1.3 0.2 setosa
42 4.4 3.2 1.3 0.2 setosa
40 5.0 3.5 1.3 0.3 setosa
41 4.5 2.3 1.3 0.3 setosa
35 5.0 3.2 1.2 0.2 setosa
14 5.8 4.0 1.2 0.2 setosa
13 4.3 3.0 1.1 0.1 setosa
22 4.6 3.6 1.0 0.2 setosa
In [298]:
titanicFrm.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB
In [301]:
print('age 결측값 - ')
print(titanicFrm['age'].isnull().sum())
age 결측값 - 
177
In [302]:
titanicFrm.groupby('sex')['age'].mean()
Out[302]:
sex
female    27.915709
male      30.726645
Name: age, dtype: float64
In [304]:
titanicFrm.groupby('sex')['age'].groups
Out[304]:
{'female': [1, 2, 3, 8, 9, 10, 11, 14, 15, 18, 19, 22, 24, 25, 28, 31, 32, 38, 39, 40, 41, 43, 44, 47, 49, 52, 53, 56, 58, 61, 66, 68, 71, 79, 82, 84, 85, 88, 98, 100, 106, 109, 111, 113, 114, 119, 123, 128, 132, 133, 136, 140, 141, 142, 147, 151, 156, 161, 166, 167, 172, 177, 180, 184, 186, 190, 192, 194, 195, 198, 199, 205, 208, 211, 215, 216, 218, 229, 230, 233, 235, 237, 240, 241, 246, 247, 251, 254, 255, 256, 257, 258, 259, 264, 268, 269, 272, 274, 275, 276, ...], 'male': [0, 4, 5, 6, 7, 12, 13, 16, 17, 20, 21, 23, 26, 27, 29, 30, 33, 34, 35, 36, 37, 42, 45, 46, 48, 50, 51, 54, 55, 57, 59, 60, 62, 63, 64, 65, 67, 69, 70, 72, 73, 74, 75, 76, 77, 78, 80, 81, 83, 86, 87, 89, 90, 91, 92, 93, 94, 95, 96, 97, 99, 101, 102, 103, 104, 105, 107, 108, 110, 112, 115, 116, 117, 118, 120, 121, 122, 124, 125, 126, 127, 129, 130, 131, 134, 135, 137, 138, 139, 143, 144, 145, 146, 148, 149, 150, 152, 153, 154, 155, ...]}
In [315]:
tmp = titanicFrm.groupby('sex')['age'].apply(lambda x : x.fillna(x.mean()))
In [316]:
titanicFrm['age'] = tmp.values
print('age 결측값 - ')
print(titanicFrm['age'].isnull().sum())
age 결측값 - 
0
In [319]:
titanicFrm
Out[319]:
survived pclass sex age sibsp parch fare embarked class who adult_male deck embark_town alive alone
0 0 3 male 38.0 1 0 7.2500 S Third man True NaN Southampton no False
1 1 1 female 26.0 1 0 71.2833 C First woman False C Cherbourg yes False
2 1 3 female 35.0 0 0 7.9250 S Third woman False NaN Southampton yes True
3 1 1 female 27.0 1 0 53.1000 S First woman False C Southampton yes False
4 0 3 male 14.0 0 0 8.0500 S Third man True NaN Southampton no True
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
886 0 2 male 28.0 0 0 13.0000 S Second man True NaN Southampton no True
887 1 1 female 25.0 0 0 30.0000 S First woman False B Southampton yes True
888 0 3 female 27.0 1 2 23.4500 S Third woman False NaN Southampton no False
889 1 1 male 26.0 0 0 30.0000 C First man True C Cherbourg yes True
890 0 3 male 32.0 0 0 7.7500 Q Third man True NaN Queenstown no True

891 rows × 15 columns

In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: