학습목표

  • 데이터 타입 : Series, DataFrame
  • Pandas를 통한 데이터 분석과 시각화(matplotlib, seaborn, folium)
  • 분석: 기술적 통계 분석, 탐색적 데이터 분석(EDA)
In [1]:
import numpy as np
import pandas as pd

def aryInfo(ary):
    """Print a short report for an array-like object.

    Prints the object's type, then its ``shape``, ``ndim`` and ``dtype``
    attributes (e.g. of a NumPy ndarray), a blank line, a ``data - ``
    heading and finally the object itself.
    """
    print('type  - ', type(ary))
    # The remaining header lines are plain attribute reads in a fixed order.
    for label, attr in (('shape - ', 'shape'), ('dim   - ', 'ndim'), ('dtype - ', 'dtype')):
        print(label, getattr(ary, attr))
    # Blank separator line followed by the data heading.
    print('', 'data - ', sep='\n')
    print(ary)
  • 파일 입출력(.csv) : loadtxt(), savetxt()
In [5]:
# Load the climate-statistics CSV into a 2-D array of strings.
rawData = np.loadtxt('./data/기후통계분석.csv',
                    dtype = 'U',       # read every field as a unicode string
                    skiprows = 1,      # skip the first line (presumably the header row)
                    delimiter = ',')

#print('type - ', type(rawData))
aryInfo(rawData)
type  -  <class 'numpy.ndarray'>
shape -  (40414, 5)
dim   -  2
dtype -  <U10

data - 
[['1907-10-01' '108' '13.5' '7.9' '20.7']
 ['1907-10-02' '108' '16.2' '7.9' '22']
 ['1907-10-03' '108' '16.2' '13.1' '21.3']
 ...
 ['2021-08-23' '108' '22.4' '21' '24']
 ['2021-08-24' '108' '23.4' '21.1' '26.4']
 ['2021-08-25' '108' '25' '23.5' '27.3']]
In [9]:
rawData[0 : 6, : ]
Out[9]:
array([['1907-10-01', '108', '13.5', '7.9', '20.7'],
       ['1907-10-02', '108', '16.2', '7.9', '22'],
       ['1907-10-03', '108', '16.2', '13.1', '21.3'],
       ['1907-10-04', '108', '16.5', '11.2', '22'],
       ['1907-10-05', '108', '17.6', '10.9', '25.4'],
       ['1907-10-06', '108', '13', '11.2', '21.3']], dtype='<U10')
In [12]:
temp = rawData[: , -1]
#print(temp)
aryInfo(temp)
type  -  <class 'numpy.ndarray'>
shape -  (40414,)
dim   -  1
dtype -  <U10

data - 
['20.7' '22' '21.3' ... '24' '26.4' '27.3']
In [13]:
print('data - ', temp[:10])
data -  ['20.7' '22' '21.3' '22' '25.4' '21.3' '16.1' '14.9' '21.1' '24.1']
In [14]:
temp = temp.astype(float)
print('data - ', temp[:10])
data -  [20.7 22.  21.3 22.  25.4 21.3 16.1 14.9 21.1 24.1]
In [35]:
# Quiz
# Which record has the highest maximum temperature? Show its climate data.
# (temp is expected to hold the last column as floats, from the earlier cells.)

# Approach 1: argmax over the already-converted temp array.
print(rawData[np.argmax(temp)])

# Approach 2: convert column 4 on the fly and look up its argmax.
maxIdx = np.argmax(rawData[:, 4].astype(float))
print(rawData[maxIdx])

print('max - ', np.max(temp))
print('argmax - ', np.argmax(temp))
# argsort orders ascending, so the largest value's index is the last entry ...
print('argsort - ', np.argsort(temp)[-1])
# ... or the first entry once the order is reversed.
print('argsort - ', np.argsort(temp)[::-1][0])
print()
print('answer - ', rawData[np.argsort(temp)[::-1][0], :])
['2018-08-01' '108' '33.6' '27.8' '39.6']
['2018-08-01' '108' '33.6' '27.8' '39.6']
max -  39.6
argmax -  39293
argsort -  39293
argsort -  39293

answer -  ['2018-08-01' '108' '33.6' '27.8' '39.6']
In [48]:
# Quiz
# Which record has the lowest average temperature? Show its climate data.
# Column 2 is used as the average temperature; it is still a string array
# here, so it is converted to float only for the argmin lookup.
temp = rawData[: , 2]

print('answer - ', rawData[np.argmin(temp.astype(float)) , :])
answer -  ['1915-01-13' '108' '-19.2' '-21.3' '-16.3']
  • 벡터와 행렬을 연산할 수 있을까?
  • 가능하다
  • 왜? 브로드캐스팅(broadcasting): 형상(shape)이 서로 호환되면 작은 배열이 큰 배열에 맞춰 확장되어 연산된다
In [49]:
# Two equal-length integer ranges used by the element-wise addition examples.
ary01 = np.arange(1,100001)
ary02 = np.arange(100001, 200001)
# Report the length of each array.
print('len - ', len(ary01), len(ary02))
len -  100000 100000
In [51]:
tempAry = np.zeros_like(ary01)
print(tempAry, len(tempAry))
[0 0 0 ... 0 0 0] 100000
In [52]:
%%time
# Add the two arrays one element at a time in a Python-level loop, writing
# each sum into the preallocated tempAry.
for idx in range(len(ary02)) :
    tempAry[idx] = ary01[idx] + ary02[idx]
print()
print('answer - ', tempAry)
answer -  [100002 100004 100006 ... 299996 299998 300000]
CPU times: total: 31.2 ms
Wall time: 27.9 ms
In [53]:
%%time
# The same element-wise addition written as one NumPy expression (no Python loop).
tempAry = ary01 + ary02
print()
print('answer - ', tempAry)
answer -  [100002 100004 100006 ... 299996 299998 300000]
CPU times: total: 0 ns
Wall time: 895 μs
In [54]:
ary01 = np.arange(3)
print(ary01)
print(ary01 * 3)
[0 1 2]
[0 3 6]
In [55]:
ary02 = np.arange(12).reshape(-1, 4)
aryInfo(ary02)
type  -  <class 'numpy.ndarray'>
shape -  (3, 4)
dim   -  2
dtype -  int64

data - 
[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]
In [63]:
ary01[ : , np.newaxis] + ary02
Out[63]:
array([[ 0,  1,  2,  3],
       [ 5,  6,  7,  8],
       [10, 11, 12, 13]])
  • np.newaxis : 배열에 새로운 축(axis)을 추가해서 차원을 늘리는 역할
In [56]:
ary = np.array([1,2,3,4])
aryInfo(ary)
type  -  <class 'numpy.ndarray'>
shape -  (4,)
dim   -  1
dtype -  int64

data - 
[1 2 3 4]
In [57]:
newAry = ary[ : , np.newaxis]
aryInfo(newAry)
type  -  <class 'numpy.ndarray'>
shape -  (4, 1)
dim   -  2
dtype -  int64

data - 
[[1]
 [2]
 [3]
 [4]]
In [58]:
newAry = ary[ np.newaxis, :]
aryInfo(newAry)
type  -  <class 'numpy.ndarray'>
shape -  (1, 4)
dim   -  2
dtype -  int64

data - 
[[1 2 3 4]]
In [64]:
'''
series(index + value)
index : 정수, 문자, 날짜, 시간 가능 (중복 라벨도 허용되지만 고유하게 두는 것을 권장)
'''
Out[64]:
'\nseries(index + value)\nindex : 정수, 문자, 날짜, 시간 가능 (중복 라벨도 허용되지만 고유하게 두는 것을 권장)\n'
In [67]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json as j

# Suppress all warnings for the rest of the session.
# NOTE(review): this blanket filter also hides pandas FutureWarning /
# DeprecationWarning messages — consider narrowing it.
import warnings
warnings.filterwarnings('ignore')

print('numpy version - ', np.__version__)
print('pandas version - ', pd.__version__)
numpy version -  2.1.3
pandas version -  2.2.3
In [75]:
# list -> ndarray -> Series: show the type after each conversion step,
# each followed by a blank line.
lst = list(range(1, 6))
print('type - ', type(lst), end='\n\n')

ary = np.array(lst)
print('type - ', type(ary), end='\n\n')

series = pd.Series(ary)
print('type - ', type(series), end='\n\n')

# Show the Series itself, then its index, underlying values and dtype.
print(series)
print('index - ', series.index)
print('values - ', series.values, type(series.values))
print('dtype - ', series.dtype)
type -  <class 'list'>

type -  <class 'numpy.ndarray'>

type -  <class 'pandas.core.series.Series'>

0    1
1    2
2    3
3    4
4    5
dtype: int64
index -  RangeIndex(start=0, stop=5, step=1)
values -  [1 2 3 4 5] <class 'numpy.ndarray'>
dtype -  int64
In [164]:
def aryInfo(ary):
    """Print a short report for an array-like object.

    Prints the object's type, then its ``shape``, ``ndim`` and ``dtype``
    attributes (e.g. of a NumPy ndarray), a blank line, a ``data - ``
    heading and finally the object itself.
    """
    print('type  - ', type(ary))
    # The remaining header lines are plain attribute reads in a fixed order.
    for label, attr in (('shape - ', 'shape'), ('dim   - ', 'ndim'), ('dtype - ', 'dtype')):
        print(label, getattr(ary, attr))
    # Blank separator line followed by the data heading.
    print('', 'data - ', sep='\n')
    print(ary)
    
def seriesInfo(s):
    """Print a short report for a pandas Series-like object.

    Prints the object's type, then its ``index``, ``values`` and ``dtype``
    attributes, a blank line, a ``data - `` heading and the object itself.
    """
    print('type - ', type(s))
    # Attribute reads are printed in a fixed order, one per line.
    for label, attr in (('index - ', 'index'), ('values - ', 'values'), ('dtype - ', 'dtype')):
        print(label, getattr(s, attr))
    # Blank separator line followed by the data heading.
    print('', 'data - ', sep='\n')
    print(s)

def frmInfo(frm):
    """Print a short report for a pandas DataFrame-like object.

    Prints the type, shape and ndim, the row and column indexes together
    with their types, the type and contents of ``frm.values``, and finally
    the frame itself under a ``data - `` heading.
    """
    print('type - ', type(frm))
    print('shape - ', frm.shape)
    print('ndim - ', frm.ndim)
    # Row labels first, then column labels; each line also shows the index type.
    for label, axis in (('row idx - ', 'index'), ('col idx - ', 'columns')):
        labels = getattr(frm, axis)
        print(label, labels, type(labels))
    matrix = frm.values
    print('values - ', type(matrix))
    print(matrix)
    print('data - ')
    print(frm)
In [77]:
# 문자인덱스로 시리즈 만든다면?
series = pd.Series({'idx01' : 1, 'idx02' : 2, 'idx03' : 3})
seriesInfo(series)
type -  <class 'pandas.core.series.Series'>
index -  Index(['idx01', 'idx02', 'idx03'], dtype='object')
values -  [1 2 3]
dtype -  int64

data - 
idx01    1
idx02    2
idx03    3
dtype: int64
In [95]:
# Alternative example: integer values labelled with district names.
# series = pd.Series(data = [1,2,3,4,5],
#                     index = ['서초', '송파', '강남', '삼성', '중구'] )
# seriesInfo(series)

# A Series with string labels; the values are of mixed type.
series = pd.Series(data = ['임섭순','2025-11-04','Male',True],
                    index = ['이름', '생년월일', '성별', '결혼여부'] )
series.name = '사용자 정보'
series.index.name = '신상 정보'
seriesInfo(series)

print()
# Positional access goes through .iloc; plain [] is kept for labels, because
# integer keys on a non-integer index are deprecated as positions in pandas.
print('index - ', series.iloc[0], series['이름'])
print('multi indexing - ', series.iloc[[0,2]], type(series.iloc[[0,2]]))
print(series[['이름', '성별']], type(series[['이름', '성별']]))
print()
# A positional slice excludes its stop position; a label slice includes its stop label.
print('slicing - ', series.iloc[0:3], series['이름':'성별'])
type -  <class 'pandas.core.series.Series'>
index -  Index(['이름', '생년월일', '성별', '결혼여부'], dtype='object', name='신상 정보')
values -  ['임섭순' '2025-11-04' 'Male' True]
dtype -  object

data - 
신상 정보
이름             임섭순
생년월일    2025-11-04
성별            Male
결혼여부          True
Name: 사용자 정보, dtype: object

index -  임섭순 임섭순
multi indexing -  신상 정보
이름     임섭순
성별    Male
Name: 사용자 정보, dtype: object <class 'pandas.core.series.Series'>
신상 정보
이름     임섭순
성별    Male
Name: 사용자 정보, dtype: object <class 'pandas.core.series.Series'>

slicing -  신상 정보
이름             임섭순
생년월일    2025-11-04
성별            Male
Name: 사용자 정보, dtype: object 신상 정보
이름             임섭순
생년월일    2025-11-04
성별            Male
Name: 사용자 정보, dtype: object
In [96]:
print('dir - ', dir(series))
dir -  ['T', '_AXIS_LEN', '_AXIS_ORDERS', '_AXIS_TO_AXIS_NUMBER', '_HANDLED_TYPES', '__abs__', '__add__', '__and__', '__annotations__', '__array__', '__array_priority__', '__array_ufunc__', '__bool__', '__class__', '__column_consortium_standard__', '__contains__', '__copy__', '__deepcopy__', '__delattr__', '__delitem__', '__dict__', '__dir__', '__divmod__', '__doc__', '__eq__', '__finalize__', '__firstlineno__', '__float__', '__floordiv__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__iadd__', '__iand__', '__ifloordiv__', '__imod__', '__imul__', '__init__', '__init_subclass__', '__int__', '__invert__', '__ior__', '__ipow__', '__isub__', '__iter__', '__itruediv__', '__ixor__', '__le__', '__len__', '__lt__', '__matmul__', '__mod__', '__module__', '__mul__', '__ne__', '__neg__', '__new__', '__nonzero__', '__or__', '__pandas_priority__', '__pos__', '__pow__', '__radd__', '__rand__', '__rdivmod__', '__reduce__', '__reduce_ex__', '__repr__', '__rfloordiv__', '__rmatmul__', '__rmod__', '__rmul__', '__ror__', '__round__', '__rpow__', '__rsub__', '__rtruediv__', '__rxor__', '__setattr__', '__setitem__', '__setstate__', '__sizeof__', '__static_attributes__', '__str__', '__sub__', '__subclasshook__', '__truediv__', '__weakref__', '__xor__', '_accessors', '_accum_func', '_agg_examples_doc', '_agg_see_also_doc', '_align_for_op', '_align_frame', '_align_series', '_append', '_arith_method', '_as_manager', '_attrs', '_binop', '_can_hold_na', '_check_inplace_and_allows_duplicate_labels', '_check_is_chained_assignment_possible', '_check_label_or_level_ambiguity', '_check_setitem_copy', '_clear_item_cache', '_clip_with_one_bound', '_clip_with_scalar', '_cmp_method', '_consolidate', '_consolidate_inplace', '_construct_axes_dict', '_construct_result', '_constructor', '_constructor_expanddim', '_constructor_expanddim_from_mgr', '_constructor_from_mgr', '_data', '_deprecate_downcast', '_dir_additions', 
'_dir_deletions', '_drop_axis', '_drop_labels_or_levels', '_duplicated', '_find_valid_index', '_flags', '_flex_method', '_from_mgr', '_get_axis', '_get_axis_name', '_get_axis_number', '_get_axis_resolvers', '_get_block_manager_axis', '_get_bool_data', '_get_cacher', '_get_cleaned_column_resolvers', '_get_index_resolvers', '_get_label_or_level_values', '_get_numeric_data', '_get_rows_with_mask', '_get_value', '_get_values_tuple', '_get_with', '_getitem_slice', '_gotitem', '_hidden_attrs', '_indexed_same', '_info_axis', '_info_axis_name', '_info_axis_number', '_init_dict', '_init_mgr', '_inplace_method', '_internal_names', '_internal_names_set', '_is_cached', '_is_copy', '_is_label_or_level_reference', '_is_label_reference', '_is_level_reference', '_is_mixed_type', '_is_view', '_is_view_after_cow_rules', '_item_cache', '_ixs', '_logical_func', '_logical_method', '_map_values', '_maybe_update_cacher', '_memory_usage', '_metadata', '_mgr', '_min_count_stat_function', '_name', '_needs_reindex_multi', '_pad_or_backfill', '_protect_consolidate', '_reduce', '_references', '_reindex_axes', '_reindex_indexer', '_reindex_multi', '_reindex_with_indexers', '_rename', '_replace_single', '_repr_data_resource_', '_repr_latex_', '_reset_cache', '_reset_cacher', '_set_as_cached', '_set_axis', '_set_axis_name', '_set_axis_nocheck', '_set_is_copy', '_set_labels', '_set_name', '_set_value', '_set_values', '_set_with', '_set_with_engine', '_shift_with_freq', '_slice', '_stat_function', '_stat_function_ddof', '_take_with_is_copy', '_to_latex_via_styler', '_typ', '_update_inplace', '_validate_dtype', '_values', '_where', 'abs', 'add', 'add_prefix', 'add_suffix', 'agg', 'aggregate', 'align', 'all', 'any', 'apply', 'argmax', 'argmin', 'argsort', 'array', 'asfreq', 'asof', 'astype', 'at', 'at_time', 'attrs', 'autocorr', 'axes', 'backfill', 'between', 'between_time', 'bfill', 'bool', 'case_when', 'clip', 'combine', 'combine_first', 'compare', 'convert_dtypes', 'copy', 'corr', 'count', 'cov', 
'cummax', 'cummin', 'cumprod', 'cumsum', 'describe', 'diff', 'div', 'divide', 'divmod', 'dot', 'drop', 'drop_duplicates', 'droplevel', 'dropna', 'dtype', 'dtypes', 'duplicated', 'empty', 'eq', 'equals', 'ewm', 'expanding', 'explode', 'factorize', 'ffill', 'fillna', 'filter', 'first', 'first_valid_index', 'flags', 'floordiv', 'ge', 'get', 'groupby', 'gt', 'hasnans', 'head', 'hist', 'iat', 'idxmax', 'idxmin', 'iloc', 'index', 'infer_objects', 'info', 'interpolate', 'is_monotonic_decreasing', 'is_monotonic_increasing', 'is_unique', 'isin', 'isna', 'isnull', 'item', 'items', 'keys', 'kurt', 'kurtosis', 'last', 'last_valid_index', 'le', 'list', 'loc', 'lt', 'map', 'mask', 'max', 'mean', 'median', 'memory_usage', 'min', 'mod', 'mode', 'mul', 'multiply', 'name', 'nbytes', 'ndim', 'ne', 'nlargest', 'notna', 'notnull', 'nsmallest', 'nunique', 'pad', 'pct_change', 'pipe', 'plot', 'pop', 'pow', 'prod', 'product', 'quantile', 'radd', 'rank', 'ravel', 'rdiv', 'rdivmod', 'reindex', 'reindex_like', 'rename', 'rename_axis', 'reorder_levels', 'repeat', 'replace', 'resample', 'reset_index', 'rfloordiv', 'rmod', 'rmul', 'rolling', 'round', 'rpow', 'rsub', 'rtruediv', 'sample', 'searchsorted', 'sem', 'set_axis', 'set_flags', 'shape', 'shift', 'size', 'skew', 'sort_index', 'sort_values', 'squeeze', 'std', 'str', 'struct', 'sub', 'subtract', 'sum', 'swapaxes', 'swaplevel', 'tail', 'take', 'to_clipboard', 'to_csv', 'to_dict', 'to_excel', 'to_frame', 'to_hdf', 'to_json', 'to_latex', 'to_list', 'to_markdown', 'to_numpy', 'to_period', 'to_pickle', 'to_sql', 'to_string', 'to_timestamp', 'to_xarray', 'transform', 'transpose', 'truediv', 'truncate', 'tz_convert', 'tz_localize', 'unique', 'unstack', 'update', 'value_counts', 'values', 'var', 'view', 'where', 'xs', '결혼여부', '생년월일', '성별', '이름']
In [98]:
# Collect the index labels and the values of the Series into plain Python lists.
keyLst = list(series.keys())
print('key list - ', keyLst)
print()
valueLst = list(series.values)
print('value list - ', valueLst)
key list -  ['이름', '생년월일', '성별', '결혼여부']

value list -  ['임섭순', '2025-11-04', 'Male', True]
In [99]:
series = pd.Series(range(10,21))
seriesInfo(series)
type -  <class 'pandas.core.series.Series'>
index -  RangeIndex(start=0, stop=11, step=1)
values -  [10 11 12 13 14 15 16 17 18 19 20]
dtype -  int64

data - 
0     10
1     11
2     12
3     13
4     14
5     15
6     16
7     17
8     18
9     19
10    20
dtype: int64
In [103]:
print(series * 2)
print(series.values * 2 , type(series.values * 2))
print('boolean indexing - ')
print(series.values%2 ==0)
print(series.values[series.values%2 ==0])
0     20
1     22
2     24
3     26
4     28
5     30
6     32
7     34
8     36
9     38
10    40
dtype: int64
[20 22 24 26 28 30 32 34 36 38 40] <class 'numpy.ndarray'>
boolean indexing - 
[ True False  True False  True False  True False  True False  True]
[10 12 14 16 18 20]
In [104]:
from datetime import date, datetime, timedelta
In [105]:
today = date(2025, 11, 4)
print(today) 
2025-11-04
In [118]:
# Build a Series whose index is ten consecutive dates starting from today.

today = date(2025, 11, 4)
idx = [today + timedelta(days=i) for i in range(10)]
# Use the date list built on the line above as the index.
s = pd.Series(range(10), index=idx)
print(s)
2025-11-04    0
2025-11-05    1
2025-11-06    2
2025-11-07    3
2025-11-08    4
2025-11-09    5
2025-11-10    6
2025-11-11    7
2025-11-12    8
2025-11-13    9
dtype: int64
In [119]:
date_index = pd.date_range(start = today, periods = 10)
print(date_index)
DatetimeIndex(['2025-11-04', '2025-11-05', '2025-11-06', '2025-11-07',
               '2025-11-08', '2025-11-09', '2025-11-10', '2025-11-11',
               '2025-11-12', '2025-11-13'],
              dtype='datetime64[ns]', freq='D')
In [122]:
# Ten random integers in [1, 100), drawn in one vectorized call and indexed
# by the dates in date_index.
series = pd.Series(data = np.random.randint(1, 100, size = 10),
                  index = date_index)

print(series)
2025-11-04    57
2025-11-05    85
2025-11-06    43
2025-11-07    85
2025-11-08    44
2025-11-09    33
2025-11-10    83
2025-11-11    48
2025-11-12    64
2025-11-13    17
Freq: D, dtype: int64
In [123]:
print(series['2025-11-04'])
57
In [128]:
# 결측값, null : isnull(), notnull()
series['2025-11-10'] = np.nan
print(series)
2025-11-04    57.0
2025-11-05    85.0
2025-11-06    43.0
2025-11-07    85.0
2025-11-08    44.0
2025-11-09    33.0
2025-11-10     NaN
2025-11-11    48.0
2025-11-12    64.0
2025-11-13    17.0
Freq: D, dtype: float64
In [131]:
# Only the last bare expression of a cell is displayed, so the isnull mask
# (True where a value is missing) has to be printed explicitly.
print(pd.isnull(series))
print()
# Complementary mask: True where a value is present.
pd.notnull(series)

Out[131]:
2025-11-04     True
2025-11-05     True
2025-11-06     True
2025-11-07     True
2025-11-08     True
2025-11-09     True
2025-11-10    False
2025-11-11     True
2025-11-12     True
2025-11-13     True
Freq: D, dtype: bool
In [136]:
# Replace missing values with the mean (the median is another common choice).
# The mask selects the missing entries; they receive the mean of the series.
series[pd.isnull(series)] = np.mean(series)
print(series)
2025-11-04    57.000000
2025-11-05    85.000000
2025-11-06    43.000000
2025-11-07    85.000000
2025-11-08    44.000000
2025-11-09    33.000000
2025-11-10    52.888889
2025-11-11    48.000000
2025-11-12    64.000000
2025-11-13    17.000000
Freq: D, dtype: float64
In [137]:
# fillna() : 결측값을 원하는 값으로 채우고자 할 때
series['2025-11-10'] = np.nan
print(series)
2025-11-04    57.0
2025-11-05    85.0
2025-11-06    43.0
2025-11-07    85.0
2025-11-08    44.0
2025-11-09    33.0
2025-11-10     NaN
2025-11-11    48.0
2025-11-12    64.0
2025-11-13    17.0
Freq: D, dtype: float64
In [139]:
series = series.fillna(0)
print(series)
2025-11-04    57.0
2025-11-05    85.0
2025-11-06    43.0
2025-11-07    85.0
2025-11-08    44.0
2025-11-09    33.0
2025-11-10     0.0
2025-11-11    48.0
2025-11-12    64.0
2025-11-13    17.0
Freq: D, dtype: float64
In [140]:
# subset
# Mark two positions as missing. The index holds dates, so the integers 3 and
# 8 are positions, not labels; .iloc makes the positional assignment explicit.
series.iloc[3] = np.nan
series.iloc[8] = np.nan
print(series)
2025-11-04    57.0
2025-11-05    85.0
2025-11-06    43.0
2025-11-07     NaN
2025-11-08    44.0
2025-11-09    33.0
2025-11-10     0.0
2025-11-11    48.0
2025-11-12     NaN
2025-11-13    17.0
Freq: D, dtype: float64
In [144]:
# Quiz
# Build a subset of the original data with the missing values left out.
subset = series.loc[series.notna()]
print(subset)
2025-11-04    57.0
2025-11-05    85.0
2025-11-06    43.0
2025-11-08    44.0
2025-11-09    33.0
2025-11-10     0.0
2025-11-11    48.0
2025-11-13    17.0
dtype: float64
In [146]:
'''
DataFrame(표 형식 - 행, 열) : .csv 을 읽어서 표로
- series는 인덱스만 있었지만
- 행 인덱스, 열 인덱스가 있다
- pd.DataFrame(data= ,columns= ,index= )
- dict, [[]] , 
'''
Out[146]:
'\nDataFrame(표 형식 - 행, 열) : .csv 을 읽어서 표로\n- series는 인덱스만 있었지만\n- 행 인덱스, 열 인덱스가 있다\n- pd.DataFrame(data= ,columns= ,index= )\n- dict, [[]] , \n'
In [157]:
# Build a 3x3 frame from column-name -> values pairs, then report its type,
# shape, dimensionality, both indexes and the underlying array.
frm = pd.DataFrame(dict(
    feature01 = [1,2,3],
    feature02 = [1,2,3],
    feature03 = [1,2,3],
))
print(frm)
print('type - ', type(frm))
print('shape - ', frm.shape)
print('ndim - ', frm.ndim)
print('row idx - ', frm.index, type(frm.index))
print('col idx - ', frm.columns, type(frm.columns))
print('values - ', type(frm.to_numpy()))
print(frm.to_numpy())
   feature01  feature02  feature03
0          1          1          1
1          2          2          2
2          3          3          3
type -  <class 'pandas.core.frame.DataFrame'>
shape -  (3, 3)
ndim -  2
row idx -  RangeIndex(start=0, stop=3, step=1) <class 'pandas.core.indexes.range.RangeIndex'>
col idx -  Index(['feature01', 'feature02', 'feature03'], dtype='object') <class 'pandas.core.indexes.base.Index'>
values -  <class 'numpy.ndarray'>
[[1 1 1]
 [2 2 2]
 [3 3 3]]
In [165]:
# 3x3 frame from a nested list, with explicit column labels and generated
# row labels user_0 .. user_2.
frm = pd.DataFrame(
    data = [[1,2,3] for _ in range(3)],
    columns = ['A', 'B', 'C'],
    index = [f'user_{n}' for n in range(3)],
)

frmInfo(frm)
type -  <class 'pandas.core.frame.DataFrame'>
shape -  (3, 3)
ndim -  2
row idx -  Index(['user_0', 'user_1', 'user_2'], dtype='object') <class 'pandas.core.indexes.base.Index'>
col idx -  Index(['A', 'B', 'C'], dtype='object') <class 'pandas.core.indexes.base.Index'>
values -  <class 'numpy.ndarray'>
[[1 2 3]
 [1 2 3]
 [1 2 3]]
data - 
        A  B  C
user_0  1  2  3
user_1  1  2  3
user_2  1  2  3
In [176]:
print('전처리(pre-processing) - 열 이름 변경, 인덱스 변경, rename()')
# Rename three column labels and one row label in a single call;
# inplace=True modifies frm itself instead of returning a new frame.
# NOTE(review): 'isMarraige' looks like a misspelling of 'isMarriage' — it is a
# runtime column label, so it is left unchanged here.
frm.rename(columns = {'A':'name', 'B':'gender', 'C':'isMarraige'},
          inplace = True,
          index = {'user_1' : 'customer01'})
전처리(pre-processing) - 열 이름 변경, 인덱스 변경, rename()
In [179]:
frmInfo(frm)
type -  <class 'pandas.core.frame.DataFrame'>
shape -  (3, 3)
ndim -  2
row idx -  Index(['user_0', 'customer01', 'user_2'], dtype='object') <class 'pandas.core.indexes.base.Index'>
col idx -  Index(['name', 'gender', 'isMarraige'], dtype='object') <class 'pandas.core.indexes.base.Index'>
values -  <class 'numpy.ndarray'>
[[1 2 3]
 [1 2 3]
 [1 2 3]]
data - 
            name  gender  isMarraige
user_0         1       2           3
customer01     1       2           3
user_2         1       2           3
In [186]:
print('데이터 추출 - indexing')
print('type - ', type(frm['name']))
print(frm['name'], type(frm['name']))
print(frm['name'].values, type(frm['name'].values))
데이터 추출 - indexing
type -  <class 'pandas.core.series.Series'>
user_0        1
customer01    1
user_2        1
Name: name, dtype: int64 <class 'pandas.core.series.Series'>
[1 1 1] <class 'numpy.ndarray'>
In [203]:
print('데이터 추가')
frm['age'] = [10,20,30]
데이터 추가
In [204]:
print(frm)
            name  gender  isMarraige  age
user_0         1       2           3   10
customer01     1       2           3   20
user_2         1       2           3   30
In [205]:
print('삭제 - 열')
del frm['age']
삭제 - 열
In [206]:
print(frm)
            name  gender  isMarraige
user_0         1       2           3
customer01     1       2           3
user_2         1       2           3
In [221]:
print('만약 , 행 인덱싱을 하고 싶다면? - 정답 : 슬라이싱(정수, 문자)')
print(frm)
print()
# An integer slice on the frame selects rows by position (stop excluded).
print(frm[0:1]) 
print()
# A label slice selects rows by index label (stop label included).
print(frm[ : 'customer01'])
만약 , 행 인덱싱을 하고 싶다면? - 정답 : 슬라이싱(정수, 문자)
            name  gender  isMarraige
user_0         1       2           3
customer01     1       2           3
user_2         1       2           3

        name  gender  isMarraige
user_0     1       2           3

            name  gender  isMarraige
user_0         1       2           3
customer01     1       2           3
In [226]:
# Rows up to and including 'customer01' of the 'name' column, in one .loc lookup.
print(frm.loc[:'customer01', 'name'])
user_0        1
customer01    1
Name: name, dtype: int64
  • json 데이터를 분석하기 위해 판다스 데이터프레임(DataFrame)으로 변환
In [228]:
import urllib.request
import json
In [236]:
# sample json api
endPoint = 'https://jsonplaceholder.typicode.com/posts'
# Open the URL in a context manager so the HTTP response is closed once the
# body has been read.
with urllib.request.urlopen(endPoint) as response:
    print('response - ')
    # print(response.read())
    # json parsing
    result = json.loads(response.read())
# print(result)
print('type - ', type(result))
print('keys - ', result[0].keys())
response - 
type -  <class 'list'>
keys -  dict_keys(['userId', 'id', 'title', 'body'])
In [237]:
frm = pd.DataFrame(result)
In [269]:
#frmInfo(frm)
In [259]:
# Quiz
# https://dummyjson.com/carts
# Load the JSON from that site and build a frame with one row per product:
# the cart's userId, total and discountedTotal together with the product's
# title, price and quantity.
import urllib.request
import json

endPoint = 'https://dummyjson.com/carts'
# Context manager closes the HTTP response once the body has been read.
with urllib.request.urlopen(endPoint) as response:
    result = json.loads(response.read())

carts = result['carts']

rows = []

# Flatten cart -> products: cart-level fields are repeated on every product row.
for cart in carts:
    for prod in cart['products']:
        rows.append({
            "userId" : cart["userId"],
            "total" : cart["total"],
            "discountedTotal" : cart["discountedTotal"],
            "title" : prod["title"],
            "price" : prod["price"],
            "quantity" : prod["quantity"]
        })

frm = pd.DataFrame(rows)
print(frm.head())
   userId      total  discountedTotal                                 title  \
0      33  103774.85         89686.65                       Charger SXT RWD   
1      33  103774.85         89686.65  Apple MacBook Pro 14 Inch Space Grey   
2      33  103774.85         89686.65                    Green Oval Earring   
3      33  103774.85         89686.65                         Apple Airpods   
4     142    4794.80          4288.95                        Cricket Helmet   

      price  quantity  
0  32999.99         3  
1   1999.99         2  
2     24.99         5  
3    129.99         5  
4     44.99         4  
In [252]:
endPoint = 'https://dummyjson.com/carts'
# Context manager closes the HTTP response once the body has been read.
with urllib.request.urlopen(endPoint) as response:
    result = json.loads(response.read())

carts = result['carts']

rows = []
for cart in carts:
    # Cart-level fields, repeated on every product row of this cart.
    userId = cart['userId']
    total = cart['total']
    discountedTotal = cart['discountedTotal']
    
    for p in cart['products']:
        rows.append({
            'userId': userId,
            'total': total,
            'discountedTotal': discountedTotal,
            'title': p['title'],
            'price': p['price'],
            'quantity': p['quantity']
        })


frm = pd.DataFrame(rows)
print(frm)
     userId      total  discountedTotal                                 title  \
0        33  103774.85         89686.65                       Charger SXT RWD   
1        33  103774.85         89686.65  Apple MacBook Pro 14 Inch Space Grey   
2        33  103774.85         89686.65                    Green Oval Earring   
3        33  103774.85         89686.65                         Apple Airpods   
4       142    4794.80          4288.95                        Cricket Helmet   
..      ...        ...              ...                                   ...   
116     170    3862.43          3488.44                            Volleyball   
117     177  128249.07        118740.76                Marni Red & Black Suit   
118     177  128249.07        118740.76                      Pacifica Touring   
119     177  128249.07        118740.76                              Potatoes   
120     177  128249.07        118740.76                             Plant Pot   

        price  quantity  
0    32999.99         3  
1     1999.99         2  
2       24.99         5  
3      129.99         5  
4       44.99         4  
..        ...       ...  
116     11.99         5  
117    179.99         1  
118  31999.99         4  
119      2.29         4  
120     14.99         4  

[121 rows x 6 columns]
In [261]:
# Descriptive statistics / exploratory data analysis
# Average price per product title, highest first, top five.
# NOTE(review): the rows built for frm carry no brand field, so this groups by
# product title rather than by brand.

frm.groupby('title')['price'].mean().sort_values(ascending=False).head()
Out[261]:
title
Charger SXT RWD            32999.99
Pacifica Touring           31999.99
300 Touring                28999.99
Rolex Cellini Moonphase    15999.99
Rolex Submariner Watch     13999.99
Name: price, dtype: float64
In [264]:
# Total purchase amount per user (top five).
# `total` is a cart-level amount repeated on every product row of that cart,
# so the rows are collapsed to one per cart before summing; summing the raw
# rows would count each cart once per product it contains.
# NOTE(review): a cart is identified here by (userId, total, discountedTotal)
# because frm carries no cart id — confirm that this is sufficient.
users = (
    frm.drop_duplicates(subset=['userId', 'total', 'discountedTotal'])
       .groupby('userId')['total']
       .sum()
       .sort_values(ascending=False)
       .head()
)
In [265]:
import matplotlib.pyplot as plt

users.plot(kind='bar', figsize=(10,4), title="user purchase (USD)")
plt.show()
No description has been provided for this image
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: