import numpy  as np 
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt 
import json   

# warning 제거
import warnings
warnings.filterwarnings('ignore')

# version check 
print('numpy  version - ' , np.__version__)
print('pandas version - ' , pd.__version__)

# 데이터 정보 출력 함수 
def aryInfo(ary):
    """Print a short report about an array-like object.

    Prints the object's Python type, then its ``shape``, ``ndim`` and
    ``dtype`` attributes, a blank line, and finally the object itself.
    Nothing is returned.
    """
    print('type - ', type(ary))
    # Attributes are looked up one at a time, right before each is printed.
    for label, attr in (('shape - ', 'shape'),
                        ('ndim  - ', 'ndim'),
                        ('dtype - ', 'dtype')):
        print(label, getattr(ary, attr))
    print()
    print('data  -', ary, sep='\n')

def seriesInfo(s):
    """Print a short report about a Series-like object.

    Prints the object's Python type, then its ``index``, ``values`` and
    ``dtype`` attributes, a blank line, and finally the object itself.
    Nothing is returned.
    """
    print('type   - ', type(s))
    # Attributes are looked up one at a time, right before each is printed.
    for label, attr in (('index  - ', 'index'),
                        ('values - ', 'values'),
                        ('dtype  - ', 'dtype')):
        print(label, getattr(s, attr))
    print()
    print('data   - ', s, sep='\n')

def frmInfo(frm):
    """Print a short report about a DataFrame-like object.

    Prints the object's Python type, ``shape`` and ``ndim``, the row and
    column index objects together with their types, the type and contents
    of ``frm.values``, and finally the object itself. Nothing is returned.
    """
    print('type    - ', type(frm))
    for label, attr in (('shape   - ', 'shape'), ('ndim    - ', 'ndim')):
        print(label, getattr(frm, attr))
    # Each axis line shows the index object followed by its type.
    for label, attr in (('row idx - ', 'index'), ('col idx - ', 'columns')):
        axis_labels = getattr(frm, attr)
        print(label, axis_labels, type(axis_labels))
    raw = frm.values
    print('values  - ', type(raw))
    print(raw)
    print('data - ', frm, sep='\n')

numpy  version -  2.1.3
pandas version -  2.2.3

# A DataFrame is a collection of columns, each of which holds data
scores = dict(
    kor=[90, 85, 100, 88, 78],
    eng=[90, 85, 100, 88, 78],
    mat=[90, 85, 100, 88, 78],
)  # turned into a frame, this gives 5 rows and 3 columns

frm = pd.DataFrame(
    data=scores,
    index=['강승우', '최호준', '임정섭', '이현우', '오신호'],
)

frmInfo(frm)

type    -  <class 'pandas.core.frame.DataFrame'>
shape   -  (5, 3)
ndim    -  2
row idx -  Index(['강승우', '최호준', '임정섭', '이현우', '오신호'], dtype='object') <class 'pandas.core.indexes.base.Index'>
col idx -  Index(['kor', 'eng', 'mat'], dtype='object') <class 'pandas.core.indexes.base.Index'>
values  -  <class 'numpy.ndarray'>
[[ 90  90  90]
 [ 85  85  85]
 [100 100 100]
 [ 88  88  88]
 [ 78  78  78]]
data - 
     kor  eng  mat
강승우   90   90   90
최호준   85   85   85
임정섭  100  100  100
이현우   88   88   88
오신호   78   78   78

frm

# Quiz
# 모든 학생의 과목평균 점수를 새로운 열('mean') 추가하고 싶다면?
# axis = 0 (열), axis = 1 (행)
frm.values

array([[ 90,  90,  90],
       [ 85,  85,  85],
       [100, 100, 100],
       [ 88,  88,  88],
       [ 78,  78,  78]])

# Average only the three subject columns. Using frm.values would average
# every column, so re-running this cell after 'mean' exists would fold the
# previous mean into the new one.
frm['mean'] = frm[['kor', 'eng', 'mat']].mean(axis=1).astype(np.int32)
frm

# Quiz
# Change 최호준's English score to 90 and recompute the mean score.
# loc[] (label based) and iloc[] (position based) are the row indexers.

frm.loc['최호준', 'eng'] = 90
# The frame already has a 'mean' column at this point, so the average must be
# taken over the subject columns only; frm.values would include the stale
# 'mean' value and divide by four instead of three.
frm['mean'] = frm[['kor', 'eng', 'mat']].mean(axis=1).astype(np.int32)
frm

# Series: selecting one row label yields a Series
lee = frm.loc['이현우', :]
print(lee)
print('type - ', type(lee))

kor     88
eng     88
mat     88
mean    88
Name: 이현우, dtype: int64
type -  <class 'pandas.core.series.Series'>

# DataFrame: selecting with a list of row labels keeps the result two-dimensional
lee = frm.loc[['이현우'], :]
print(lee)
print('type - ', type(lee))

     kor  eng  mat  mean
이현우   88   88   88    88
type -  <class 'pandas.core.frame.DataFrame'>

# One row label plus a label slice of columns ('kor' through 'eng', inclusive)
lim = frm.loc['임정섭', slice('kor', 'eng')]
print(lim)
print('type - ', type(lim))

kor    100
eng    100
Name: 임정섭, dtype: int64
type -  <class 'pandas.core.series.Series'>

titanicRawData = sns.load_dataset('titanic')
print('type - ', type(titanicRawData))
titanicRawData.head()

type -  <class 'pandas.core.frame.DataFrame'>

titanicRawData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
 15  age_by_10    891 non-null    int64   
dtypes: bool(2), category(2), float64(2), int64(5), object(5)
memory usage: 87.6+ KB

# Quiz
# How many passengers are there in each cabin class (pclass)?

print('유일값 확인 - unique()')
print(titanicRawData['pclass'].unique())
# Print the value_counts() Series itself: taking .values discards the pclass
# labels, leaving bare counts that cannot be matched to a class.
print(titanicRawData['pclass'].value_counts())

유일값 확인 - unique()
[3 1 2]
[491 216 184]

# 데이터 프레임의 컬럼명 확인
print('type - ', type(titanicRawData.columns))
print(titanicRawData.columns)

type -  <class 'pandas.core.indexes.base.Index'>
Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')

# Quiz
# Add an 'age_by_10' column holding each passenger's age plus 10.

# 'age' contains missing values, and a NaN cannot be represented in a plain
# int column: casting it with astype('int') produces a meaningless integer
# rather than a missing value. Truncate toward zero (what the int cast did for
# real ages) and store the result in pandas' nullable 'Int64' dtype so that
# missing ages stay missing.
titanicRawData['age_by_10'] = np.trunc(titanicRawData['age'] + 10).astype('Int64')
titanicRawData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
 15  age_by_10    891 non-null    int64   
dtypes: bool(2), category(2), float64(2), int64(5), object(5)
memory usage: 87.6+ KB

# Remove the age_by_10 column again.
# drop(..., inplace=True) modifies the frame itself instead of returning a copy
titanicRawData.drop(columns=['age_by_10'], inplace=True)

titanicRawData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB

# Summary statistics for the fare column: max, min, mean and sum
print('fare max - ', np.max(titanicRawData['fare'].values))
print('fare min - ', np.min(titanicRawData['fare'].values))
print('fare mean - ', np.mean(titanicRawData['fare'].values))
# The sum was announced in the task description but never printed.
print('fare sum - ', np.sum(titanicRawData['fare'].values))

fare max -  512.3292
fare min -  0.0
fare mean -  32.204207968574636

# Quiz
# Build a subset containing only the third-class (pclass == 3) passengers.
# .copy() makes the subset an independent frame: it is modified in place
# further down (reset_index / drop / set_index with inplace=True), and doing
# that on a filtered selection of titanicRawData is what pandas'
# SettingWithCopyWarning warns about.
pclassSubsetFrm = titanicRawData[titanicRawData['pclass'] == 3].copy()
print('type - ', type(pclassSubsetFrm))
print(pclassSubsetFrm)

type -  <class 'pandas.core.frame.DataFrame'>
     survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0           0       3    male  22.0      1      0   7.2500        S  Third   
2           1       3  female  26.0      0      0   7.9250        S  Third   
4           0       3    male  35.0      0      0   8.0500        S  Third   
5           0       3    male   NaN      0      0   8.4583        Q  Third   
7           0       3    male   2.0      3      1  21.0750        S  Third   
..        ...     ...     ...   ...    ...    ...      ...      ...    ...   
882         0       3  female  22.0      0      0  10.5167        S  Third   
884         0       3    male  25.0      0      0   7.0500        S  Third   
885         0       3  female  39.0      0      5  29.1250        Q  Third   
888         0       3  female   NaN      1      2  23.4500        S  Third   
890         0       3    male  32.0      0      0   7.7500        Q  Third   

       who  adult_male deck  embark_town alive  alone  
0      man        True  NaN  Southampton    no  False  
2    woman       False  NaN  Southampton   yes   True  
4      man        True  NaN  Southampton    no   True  
5      man        True  NaN   Queenstown    no   True  
7    child       False  NaN  Southampton    no  False  
..     ...         ...  ...          ...   ...    ...  
882  woman       False  NaN  Southampton    no   True  
884    man        True  NaN  Southampton    no   True  
885  woman       False  NaN   Queenstown    no  False  
888  woman       False  NaN  Southampton    no  False  
890    man        True  NaN   Queenstown    no   True  

[491 rows x 15 columns]

# Quiz
# From the subset above, build a new subset holding only sex and survived.
newsubset = pclassSubsetFrm.loc[:, ['sex', 'survived']]
print(newsubset.head())

      sex  survived
0    male         0
1  female         1
2    male         0
3    male         0
4    male         0

# Re-number the row index.
# reset_index() replaces the index with 0..n-1 and keeps the previous index
# values in a new column named 'index'; inplace=True modifies the frame itself.
pclassSubsetFrm.reset_index(inplace = True)
print(pclassSubsetFrm.head())

   index  survived  pclass     sex   age  sibsp  parch     fare embarked  \
0      0         0       3    male  22.0      1      0   7.2500        S   
1      1         1       3  female  26.0      0      0   7.9250        S   
2      2         0       3    male  35.0      0      0   8.0500        S   
3      3         0       3    male   NaN      0      0   8.4583        Q   
4      4         0       3    male   2.0      3      1  21.0750        S   

   class    who  adult_male deck  embark_town alive  alone  
0  Third    man        True  NaN  Southampton    no  False  
1  Third  woman       False  NaN  Southampton   yes   True  
2  Third    man        True  NaN  Southampton    no   True  
3  Third    man        True  NaN   Queenstown    no   True  
4  Third  child       False  NaN  Southampton    no  False

print(pclassSubsetFrm)

     index  survived  pclass     sex   age  sibsp  parch     fare embarked  \
0        0         0       3    male  22.0      1      0   7.2500        S   
1        1         1       3  female  26.0      0      0   7.9250        S   
2        2         0       3    male  35.0      0      0   8.0500        S   
3        3         0       3    male   NaN      0      0   8.4583        Q   
4        4         0       3    male   2.0      3      1  21.0750        S   
..     ...       ...     ...     ...   ...    ...    ...      ...      ...   
486    486         0       3  female  22.0      0      0  10.5167        S   
487    487         0       3    male  25.0      0      0   7.0500        S   
488    488         0       3  female  39.0      0      5  29.1250        Q   
489    489         0       3  female   NaN      1      2  23.4500        S   
490    490         0       3    male  32.0      0      0   7.7500        Q   

     class    who  adult_male deck  embark_town alive  alone  
0    Third    man        True  NaN  Southampton    no  False  
1    Third  woman       False  NaN  Southampton   yes   True  
2    Third    man        True  NaN  Southampton    no   True  
3    Third    man        True  NaN   Queenstown    no   True  
4    Third  child       False  NaN  Southampton    no  False  
..     ...    ...         ...  ...          ...   ...    ...  
486  Third  woman       False  NaN  Southampton    no   True  
487  Third    man        True  NaN  Southampton    no   True  
488  Third  woman       False  NaN   Queenstown    no  False  
489  Third  woman       False  NaN  Southampton    no  False  
490  Third    man        True  NaN   Queenstown    no   True  

[491 rows x 16 columns]

# Discard the 'index' column left behind by reset_index()
pclassSubsetFrm.drop(columns='index', inplace=True)
pclassSubsetFrm

# set_index: turns a specific column into the row index

# Create an 'index' column again so there is a column to promote.
pclassSubsetFrm.reset_index(inplace=True)
pclassSubsetFrm

# Promote the 'index' column to be the row index (modifies the frame in place).
pclassSubsetFrm.set_index('index', inplace=True)
pclassSubsetFrm

# Quiz
# From the original data, extract the passengers who are at least 60 years
# old, travel first class and are female.

sub = titanicRawData.loc[
    (titanicRawData['sex'] == 'female')
    & (titanicRawData['pclass'] == 1)
    & (titanicRawData['age'] >= 60)
]
print(sub)

     survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
275         1       1  female  63.0      1      0  77.9583        S  First   
366         1       1  female  60.0      1      0  75.2500        C  First   
829         1       1  female  62.0      0      0  80.0000      NaN  First   

       who  adult_male deck  embark_town alive  alone  
275  woman       False    D  Southampton   yes  False  
366  woman       False    D    Cherbourg   yes  False  
829  woman       False    B          NaN   yes   True

# Same filter as for the female passengers, applied to male passengers
sub = titanicRawData.loc[
    (titanicRawData['sex'] == 'male')
    & (titanicRawData['pclass'] == 1)
    & (titanicRawData['age'] >= 60)
]
print(sub)

     survived  pclass   sex   age  sibsp  parch      fare embarked  class  \
54          0       1  male  65.0      0      1   61.9792        C  First   
96          0       1  male  71.0      0      0   34.6542        C  First   
170         0       1  male  61.0      0      0   33.5000        S  First   
252         0       1  male  62.0      0      0   26.5500        S  First   
438         0       1  male  64.0      1      4  263.0000        S  First   
456         0       1  male  65.0      0      0   26.5500        S  First   
493         0       1  male  71.0      0      0   49.5042        C  First   
545         0       1  male  64.0      0      0   26.0000        S  First   
555         0       1  male  62.0      0      0   26.5500        S  First   
587         1       1  male  60.0      1      1   79.2000        C  First   
625         0       1  male  61.0      0      0   32.3208        S  First   
630         1       1  male  80.0      0      0   30.0000        S  First   
694         0       1  male  60.0      0      0   26.5500        S  First   
745         0       1  male  70.0      1      1   71.0000        S  First   

     who  adult_male deck  embark_town alive  alone  
54   man        True    B    Cherbourg    no  False  
96   man        True    A    Cherbourg    no   True  
170  man        True    B  Southampton    no   True  
252  man        True    C  Southampton    no   True  
438  man        True    C  Southampton    no  False  
456  man        True    E  Southampton    no   True  
493  man        True  NaN    Cherbourg    no   True  
545  man        True  NaN  Southampton    no   True  
555  man        True  NaN  Southampton    no   True  
587  man        True    B    Cherbourg   yes  False  
625  man        True    D  Southampton    no   True  
630  man        True    A  Southampton   yes   True  
694  man        True  NaN  Southampton    no   True  
745  man        True    B  Southampton    no  False

# Build a subset of the original data sorted by passenger age, descending.
# reset_index(drop=True) renumbers the rows directly, instead of first turning
# the old index into an 'index' column and then dropping that column again.
subsetFrm = titanicRawData.sort_values(by='age', ascending=False).reset_index(drop=True)
subsetFrm

print('성별에 따른 승객수를 시각화하기 위해서 정렬을 한다면 - ')
titanicRawData['sex'].value_counts().sort_values(ascending = False)

성별에 따른 승객수를 시각화하기 위해서 정렬을 한다면 -

sex
male      577
female    314
Name: count, dtype: int64

titanicRawData.sort_index(ascending= False)

namesFrm = pd.read_csv('./data/year2022_baby_name.csv',
                      sep=',',
                      encoding='utf-8')
print('type - ', type(namesFrm))

type -  <class 'pandas.core.frame.DataFrame'>

namesFrm.head()

namesFrm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33838 entries, 0 to 33837
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   NAME    33838 non-null  object
 1   GENDER  33838 non-null  object
 2   COUNT   33838 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 793.2+ KB

# namesFrm.describe() # 숫자데이터 통계정보
namesFrm.columns

Index(['NAME', 'GENDER', 'COUNT'], dtype='object')

# Quiz
# Build a subset sorted by the COUNT column in descending order and renumber
# its rows.
# reset_index(drop=True) discards the old index directly, so no temporary
# 'index' column has to be created and dropped.

subset = namesFrm.sort_values(by='COUNT', ascending=False).reset_index(drop=True)
subset

# Quiz
# Rename the columns (NAME -> name, GENDER -> gender, COUNT -> count)
# and extract the rows whose gender is male.

# Lower-case every column label and assign the resulting list back.
# A list works for direct assignment to .columns; DataFrame.rename(columns=...)
# would need a mapping (or a callable) instead. Once the labels have been
# assigned here, a further rename to lower case would be a no-op.
cols = [col.lower() for col in subset.columns]
subset.columns = cols
print(subset)

subset[subset['gender'] == 'M']

           name gender  count
0      Isabella      F  22731
1         Jacob      M  21875
2        Sophia      F  20477
3         Ethan      M  17866
4          Emma      F  17179
...         ...    ...    ...
33833     Xaine      M      5
33834    Xaveon      M      5
33835   Xavious      M      5
33836    Xiomar      M      5
33837     Xylan      M      5

[33838 rows x 3 columns]

frm = pd.read_csv('./data/service_data_groupby_sample.csv',
                      encoding='cp949')
print('type - ', type(frm))
frm

type -  <class 'pandas.core.frame.DataFrame'>

frm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      12 non-null     int64 
 1   gender  12 non-null     object
 2   height  12 non-null     int64 
 3   age     12 non-null     int64 
 4   region  12 non-null     object
dtypes: int64(3), object(2)
memory usage: 612.0+ bytes

# Quiz
# 지역별 나이평균을 확인하고 싶다면??

# type(frm.groupby('region').get_group('경기'))
# type(frm.groupby('region').get_group('경기')['age'])

frm.groupby('region')['age'].mean()

region
경기    32.00
서울    28.25
인천    39.00
충북    33.00
Name: age, dtype: float64

# Quiz
# Split the data into groups by gender (mean height per group), then turn the
# group key back into an ordinary column.
tmp = frm.groupby('gender')[['height']].mean().reset_index()
tmp

# 다중통계량 : agg()
subset = frm.drop('region', axis = 1)
subset

subset.groupby('gender').agg(['mean', 'var', 'std'])

frm.groupby('gender')['age'].agg(['max', 'min', 'mean', 'median']).reset_index()

# Quiz
# 성별에 따른 거주지의 최빈값(mode())을 확인하고 싶다면?

# frm.groupby('gender')['region'].agg(lambda x : x.mode())
frm.groupby('gender')['region'].apply(lambda x : x.mode())

gender   
남자      0    서울
여자      0    서울
Name: region, dtype: object

tipsFrm = sns.load_dataset('tips')
tipsFrm.head()

tipsFrm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB

tipsFrm.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

titanicFrm = sns.load_dataset('titanic')
titanicFrm.head()

# Quiz
# tipsFrm 전체 평균 팁은 얼마일까요?

tip = tipsFrm['tip'].mean()
print(tip)

2.99827868852459

# Quiz
# Which sex leaves the higher average tip?
# 'sex' and 'smoker' are categorical columns. Grouping by a categorical without
# an explicit `observed` relies on a default that pandas has deprecated (the
# FutureWarning is silenced at the top of this file), so pin the current
# behaviour explicitly.

print(tipsFrm.groupby('sex', observed=False)['tip'].agg('mean').reset_index())

# Which group has the higher average tip ratio: smokers or non-smokers?
# assign() adds the ratio column to a temporary copy, leaving tipsFrm untouched
print(tipsFrm.assign(pct = tipsFrm['tip'] / tipsFrm['total_bill']).groupby('smoker', observed=False)['pct'].mean())

      sex       tip
0    Male  3.089618
1  Female  2.833448
smoker
Yes    0.163196
No     0.159328
Name: pct, dtype: float64

# Quiz
# On which day of the week is the total amount of tips the largest?
# 'day' is categorical, so `observed` is passed explicitly rather than relying
# on the deprecated implicit default.

result = tipsFrm.groupby('day', observed=False)['tip'].sum().sort_values(ascending=False).head(1)
print('type - ', type(result))
print(result)

type -  <class 'pandas.core.series.Series'>
day
Sat    260.4
Name: tip, dtype: float64

# Quiz
# Which time slot (['Dinner', 'Lunch']) has the higher average tip ratio?

# Store the tip ratio as a real column of tipsFrm, then pick the time slot with
# the highest mean ratio. (The same pipeline used to be evaluated once more via
# assign() just before this, with its result thrown away.)
# 'time' is categorical, so `observed` is passed explicitly rather than relying
# on the deprecated implicit default.
tipsFrm['pct'] = tipsFrm['tip'] / tipsFrm['total_bill']
result = tipsFrm.groupby('time', observed=False)['pct'].mean().sort_values(ascending = False).head(1)
print(result)

time
Lunch    0.164128
Name: pct, dtype: float64

# Quiz : titanicFrm
# subset - (age,sex,class,fare,survived)

# subset = titanicFrm[['age', 'sex', 'class', 'fare', 'survived']]
# subset.head()

subset = titanicFrm.loc[: ,['age', 'sex', 'class', 'fare', 'survived']]
subset.head()

# Quiz
# Group the passengers by cabin class and extract the first-class passengers
# as a DataFrame.

# 'class' is categorical, so `observed` is passed explicitly rather than
# relying on the deprecated implicit default.
grp = subset.groupby('class', observed=False)

# get_group() returns the rows belonging to one group key as a DataFrame
result = grp.get_group('First')
print(result)

#subset.loc[grp.groups['First'].values , : ]

      age     sex  class     fare  survived
1    38.0  female  First  71.2833         1
3    35.0  female  First  53.1000         1
6    54.0    male  First  51.8625         0
11   58.0  female  First  26.5500         1
23   28.0    male  First  35.5000         1
..    ...     ...    ...      ...       ...
871  47.0  female  First  52.5542         1
872  33.0    male  First   5.0000         0
879  56.0  female  First  83.1583         1
887  19.0  female  First  30.0000         1
889  26.0    male  First  30.0000         1

[216 rows x 5 columns]

irisFrm = sns.load_dataset('iris')
irisFrm.head()

irisFrm['species'].value_counts()

species
setosa        50
versicolor    50
virginica     50
Name: count, dtype: int64

grp = irisFrm.groupby('species')
print(grp)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000014B9D9DF050>

print(grp.groups)

{'setosa': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49], 'versicolor': [50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99], 'virginica': [100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149]}

# Iterating a GroupBy object yields (group key, sub-frame) pairs.
for key, group in grp :
    print('key - ', key)
    print()
    # NOTE(review): `display` is not imported anywhere in the visible code —
    # presumably the IPython/Jupyter built-in; confirm before running this as
    # a plain script.
    display(group)

key -  setosa

key -  versicolor

key -  virginica

irisFrm.sort_values(by='petal_length', ascending = False).groupby('species').get_group('setosa')

titanicFrm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB

print('age 결측값 - ')
print(titanicFrm['age'].isnull().sum())

age 결측값 - 
177

titanicFrm.groupby('sex')['age'].mean()

sex
female    27.915709
male      30.726645
Name: age, dtype: float64

titanicFrm.groupby('sex')['age'].groups

{'female': [1, 2, 3, 8, 9, 10, 11, 14, 15, 18, 19, 22, 24, 25, 28, 31, 32, 38, 39, 40, 41, 43, 44, 47, 49, 52, 53, 56, 58, 61, 66, 68, 71, 79, 82, 84, 85, 88, 98, 100, 106, 109, 111, 113, 114, 119, 123, 128, 132, 133, 136, 140, 141, 142, 147, 151, 156, 161, 166, 167, 172, 177, 180, 184, 186, 190, 192, 194, 195, 198, 199, 205, 208, 211, 215, 216, 218, 229, 230, 233, 235, 237, 240, 241, 246, 247, 251, 254, 255, 256, 257, 258, 259, 264, 268, 269, 272, 274, 275, 276, ...], 'male': [0, 4, 5, 6, 7, 12, 13, 16, 17, 20, 21, 23, 26, 27, 29, 30, 33, 34, 35, 36, 37, 42, 45, 46, 48, 50, 51, 54, 55, 57, 59, 60, 62, 63, 64, 65, 67, 69, 70, 72, 73, 74, 75, 76, 77, 78, 80, 81, 83, 86, 87, 89, 90, 91, 92, 93, 94, 95, 96, 97, 99, 101, 102, 103, 104, 105, 107, 108, 110, 112, 115, 116, 117, 118, 120, 121, 122, 124, 125, 126, 127, 129, 130, 131, 134, 135, 137, 138, 139, 143, 144, 145, 146, 148, 149, 150, 152, 153, 154, 155, ...]}

# Fill each missing age with the mean age of the passenger's own sex.
# transform() returns a Series aligned with titanicFrm's original row index, so
# every filled value lands on the row it was computed for.
# With the pandas version this notebook reports (2.2.3), groupby(...).apply(...)
# instead returns the rows group by group (all 'female' rows, then all 'male'
# rows); assigning its .values positionally attaches ages to the wrong
# passengers.
tmp = titanicFrm.groupby('sex')['age'].transform(lambda x : x.fillna(x.mean()))

titanicFrm['age'] = tmp
print('age 결측값 - ')
print(titanicFrm['age'].isnull().sum())

age 결측값 - 
0

titanicFrm

	id			height			age
	mean	var	std	mean	var	std	mean	var	std
gender
남자	7.5	14.7	3.834058	180.666667	21.066667	4.589844	31.333333	60.266667	7.763161
여자	5.5	11.5	3.391165	161.500000	59.500000	7.713624	29.000000	64.400000	8.024961

	sepal_length	sepal_width	petal_length	petal_width	species
0	5.1	3.5	1.4	0.2	setosa
1	4.9	3.0	1.4	0.2	setosa
2	4.7	3.2	1.3	0.2	setosa
3	4.6	3.1	1.5	0.2	setosa
4	5.0	3.6	1.4	0.2	setosa

학습목표

sort : 인덱스 기준, 열 값을 기준으로 # ascending = True(오름차순) False(내림차순)

통계량 확인

	kor	eng	mat
강승우	90	90	90
최호준	85	85	85
임정섭	100	100	100
이현우	88	88	88
오신호	78	78	78

	survived	pclass	sex	age	sibsp	fare	embarked	class	who	adult_male	deck	embark_town	alive	alone
0	0	3	male	22.0	1	7.2500	S	Third	man	True	NaN	Southampton	no	False
1	1	1	female	38.0	1	71.2833	C	First	woman	False	C	Cherbourg	yes	False
2	1	3	female	26.0	0	7.9250	S	Third	woman	False	NaN	Southampton	yes	True
3	1	1	female	35.0	1	53.1000	S	First	woman	False	C	Southampton	yes	False
4	0	3	male	35.0	0	8.0500	S	Third	man	True	NaN	Southampton	no	True

	survived	pclass	sex	age	sibsp	parch	fare	embarked	class	who	adult_male	deck	embark_town	alive	alone
index
0	0	3	male	22.0	1	0	7.2500	S	Third	man	True	NaN	Southampton	no	False
1	1	3	female	26.0	0	0	7.9250	S	Third	woman	False	NaN	Southampton	yes	True
2	0	3	male	35.0	0	0	8.0500	S	Third	man	True	NaN	Southampton	no	True
3	0	3	male	NaN	0	0	8.4583	Q	Third	man	True	NaN	Queenstown	no	True
4	0	3	male	2.0	3	1	21.0750	S	Third	child	False	NaN	Southampton	no	False
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
486	0	3	female	22.0	0	0	10.5167	S	Third	woman	False	NaN	Southampton	no	True
487	0	3	male	25.0	0	0	7.0500	S	Third	man	True	NaN	Southampton	no	True
488	0	3	female	39.0	0	5	29.1250	Q	Third	woman	False	NaN	Queenstown	no	False
489	0	3	female	NaN	1	2	23.4500	S	Third	woman	False	NaN	Southampton	no	False
490	0	3	male	32.0	0	0	7.7500	Q	Third	man	True	NaN	Queenstown	no	True

	survived	pclass	sex	age	sibsp	parch	fare	embarked	class	who	adult_male	deck	embark_town	alive	alone
0	1	1	male	80.0	0	0	30.0000	S	First	man	True	A	Southampton	yes	True
1	0	3	male	74.0	0	0	7.7750	S	Third	man	True	NaN	Southampton	no	True
2	0	1	male	71.0	0	0	49.5042	C	First	man	True	NaN	Cherbourg	no	True
3	0	1	male	71.0	0	0	34.6542	C	First	man	True	A	Cherbourg	no	True
4	0	3	male	70.5	0	0	7.7500	Q	Third	man	True	NaN	Queenstown	no	True
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
886	0	3	male	NaN	0	0	7.2292	C	Third	man	True	NaN	Cherbourg	no	True
887	0	3	female	NaN	8	2	69.5500	S	Third	woman	False	NaN	Southampton	no	False
888	0	3	male	NaN	0	0	9.5000	S	Third	man	True	NaN	Southampton	no	True
889	0	3	male	NaN	0	0	7.8958	S	Third	man	True	NaN	Southampton	no	True
890	0	3	female	NaN	1	2	23.4500	S	Third	woman	False	NaN	Southampton	no	False

	NAME	GENDER	COUNT
0	Isabella	F	22731
1	Sophia	F	20477
2	Emma	F	17179
3	Olivia	F	16860
4	Ava	F	15300

	NAME	GENDER	COUNT
0	Isabella	F	22731
1	Jacob	M	21875
2	Sophia	F	20477
3	Ethan	M	17866
4	Emma	F	17179
...	...	...	...
33833	Xaine	M	5
33834	Xaveon	M	5
33835	Xavious	M	5
33836	Xiomar	M	5
33837	Xylan	M	5

	id	gender	height	age	region
0	1	남자	175	22	서울
1	2	여자	160	23	서울
2	3	여자	161	21	서울
3	4	여자	170	33	서울
4	5	여자	155	35	경기
5	6	남자	181	41	서울
6	7	남자	183	33	충북
7	8	여자	171	22	서울
8	9	남자	188	29	경기
9	10	남자	177	39	인천
10	11	여자	152	40	서울
11	12	남자	180	24	서울

	total_bill	tip	sex	smoker	day	time	size
0	16.99	1.01	Female	No	Sun	Dinner	2
1	10.34	1.66	Male	No	Sun	Dinner	3
2	21.01	3.50	Male	No	Sun	Dinner	3
3	23.68	3.31	Male	No	Sun	Dinner	2
4	24.59	3.61	Female	No	Sun	Dinner	4

	sepal_length	sepal_width	petal_length	petal_width	species
50	7.0	3.2	4.7	1.4	versicolor
51	6.4	3.2	4.5	1.5	versicolor
52	6.9	3.1	4.9	1.5	versicolor
53	5.5	2.3	4.0	1.3	versicolor
54	6.5	2.8	4.6	1.5	versicolor
55	5.7	2.8	4.5	1.3	versicolor
56	6.3	3.3	4.7	1.6	versicolor
57	4.9	2.4	3.3	1.0	versicolor
58	6.6	2.9	4.6	1.3	versicolor
59	5.2	2.7	3.9	1.4	versicolor
60	5.0	2.0	3.5	1.0	versicolor
61	5.9	3.0	4.2	1.5	versicolor
62	6.0	2.2	4.0	1.0	versicolor
63	6.1	2.9	4.7	1.4	versicolor
64	5.6	2.9	3.6	1.3	versicolor
65	6.7	3.1	4.4	1.4	versicolor
66	5.6	3.0	4.5	1.5	versicolor
67	5.8	2.7	4.1	1.0	versicolor
68	6.2	2.2	4.5	1.5	versicolor
69	5.6	2.5	3.9	1.1	versicolor
70	5.9	3.2	4.8	1.8	versicolor
71	6.1	2.8	4.0	1.3	versicolor
72	6.3	2.5	4.9	1.5	versicolor
73	6.1	2.8	4.7	1.2	versicolor
74	6.4	2.9	4.3	1.3	versicolor
75	6.6	3.0	4.4	1.4	versicolor
76	6.8	2.8	4.8	1.4	versicolor
77	6.7	3.0	5.0	1.7	versicolor
78	6.0	2.9	4.5	1.5	versicolor
79	5.7	2.6	3.5	1.0	versicolor
80	5.5	2.4	3.8	1.1	versicolor
81	5.5	2.4	3.7	1.0	versicolor
82	5.8	2.7	3.9	1.2	versicolor
83	6.0	2.7	5.1	1.6	versicolor
84	5.4	3.0	4.5	1.5	versicolor
85	6.0	3.4	4.5	1.6	versicolor
86	6.7	3.1	4.7	1.5	versicolor
87	6.3	2.3	4.4	1.3	versicolor
88	5.6	3.0	4.1	1.3	versicolor
89	5.5	2.5	4.0	1.3	versicolor
90	5.5	2.6	4.4	1.2	versicolor
91	6.1	3.0	4.6	1.4	versicolor
92	5.8	2.6	4.0	1.2	versicolor
93	5.0	2.3	3.3	1.0	versicolor
94	5.6	2.7	4.2	1.3	versicolor
95	5.7	3.0	4.2	1.2	versicolor
96	5.7	2.9	4.2	1.3	versicolor
97	6.2	2.9	4.3	1.3	versicolor
98	5.1	2.5	3.0	1.1	versicolor
99	5.7	2.8	4.1	1.3	versicolor

	sepal_length	sepal_width	petal_length	petal_width	species
100	6.3	3.3	6.0	2.5	virginica
101	5.8	2.7	5.1	1.9	virginica
102	7.1	3.0	5.9	2.1	virginica
103	6.3	2.9	5.6	1.8	virginica
104	6.5	3.0	5.8	2.2	virginica
105	7.6	3.0	6.6	2.1	virginica
106	4.9	2.5	4.5	1.7	virginica
107	7.3	2.9	6.3	1.8	virginica
108	6.7	2.5	5.8	1.8	virginica
109	7.2	3.6	6.1	2.5	virginica
110	6.5	3.2	5.1	2.0	virginica
111	6.4	2.7	5.3	1.9	virginica
112	6.8	3.0	5.5	2.1	virginica
113	5.7	2.5	5.0	2.0	virginica
114	5.8	2.8	5.1	2.4	virginica
115	6.4	3.2	5.3	2.3	virginica
116	6.5	3.0	5.5	1.8	virginica
117	7.7	3.8	6.7	2.2	virginica
118	7.7	2.6	6.9	2.3	virginica
119	6.0	2.2	5.0	1.5	virginica
120	6.9	3.2	5.7	2.3	virginica
121	5.6	2.8	4.9	2.0	virginica
122	7.7	2.8	6.7	2.0	virginica
123	6.3	2.7	4.9	1.8	virginica
124	6.7	3.3	5.7	2.1	virginica
125	7.2	3.2	6.0	1.8	virginica
126	6.2	2.8	4.8	1.8	virginica
127	6.1	3.0	4.9	1.8	virginica
128	6.4	2.8	5.6	2.1	virginica
129	7.2	3.0	5.8	1.6	virginica
130	7.4	2.8	6.1	1.9	virginica
131	7.9	3.8	6.4	2.0	virginica
132	6.4	2.8	5.6	2.2	virginica
133	6.3	2.8	5.1	1.5	virginica
134	6.1	2.6	5.6	1.4	virginica
135	7.7	3.0	6.1	2.3	virginica
136	6.3	3.4	5.6	2.4	virginica
137	6.4	3.1	5.5	1.8	virginica
138	6.0	3.0	4.8	1.8	virginica
139	6.9	3.1	5.4	2.1	virginica
140	6.7	3.1	5.6	2.4	virginica
141	6.9	3.1	5.1	2.3	virginica
142	5.8	2.7	5.1	1.9	virginica
143	6.8	3.2	5.9	2.3	virginica
144	6.7	3.3	5.7	2.5	virginica
145	6.7	3.0	5.2	2.3	virginica
146	6.3	2.5	5.0	1.9	virginica
147	6.5	3.0	5.2	2.0	virginica
148	6.2	3.4	5.4	2.3	virginica
149	5.9	3.0	5.1	1.8	virginica