학습목표
- 데이터 타입 : Series, DataFrame
- Pandas를 통해서 데이터 분석과 시각화(matplotlib, seaborn, folium)
- 분석: 기술적 통계 분석, 탐색적 데이터 분석(EDA)
In [1]:
import numpy as np
import pandas as pd
def aryInfo(ary) :
    """Print a short report about a NumPy array.

    Writes the object's type, shape, number of dimensions and dtype to
    stdout, followed by a blank line and the array contents.

    Args:
        ary: Object exposing ``shape``, ``ndim`` and ``dtype`` attributes
            (a ``numpy.ndarray`` in this notebook).
    """
    print('type - ', type(ary))
    print('shape - ', ary.shape)
    print('dim - ', ary.ndim)
    print('dtype - ', ary.dtype)
    print()
    print('data - ')
    print(ary)
- 파일 입출력(.csv) : loadtxt(), savetxt()
In [5]:
# Read the climate CSV as unicode strings, skipping the header row.
rawData = np.loadtxt('./data/기후통계분석.csv', delimiter = ',', skiprows = 1, dtype = 'U')
aryInfo(rawData)
type - <class 'numpy.ndarray'> shape - (40414, 5) dim - 2 dtype - <U10 data - [['1907-10-01' '108' '13.5' '7.9' '20.7'] ['1907-10-02' '108' '16.2' '7.9' '22'] ['1907-10-03' '108' '16.2' '13.1' '21.3'] ... ['2021-08-23' '108' '22.4' '21' '24'] ['2021-08-24' '108' '23.4' '21.1' '26.4'] ['2021-08-25' '108' '25' '23.5' '27.3']]
In [9]:
# First six rows, all columns.
rawData[:6]
Out[9]:
array([['1907-10-01', '108', '13.5', '7.9', '20.7'],
['1907-10-02', '108', '16.2', '7.9', '22'],
['1907-10-03', '108', '16.2', '13.1', '21.3'],
['1907-10-04', '108', '16.5', '11.2', '22'],
['1907-10-05', '108', '17.6', '10.9', '25.4'],
['1907-10-06', '108', '13', '11.2', '21.3']], dtype='<U10')
In [12]:
# Take the last column of every row (still stored as strings).
temp = rawData[..., -1]
aryInfo(temp)
type - <class 'numpy.ndarray'> shape - (40414,) dim - 1 dtype - <U10 data - ['20.7' '22' '21.3' ... '24' '26.4' '27.3']
In [13]:
# Preview the first ten entries of temp.
print('data - ', temp[:10])
data - ['20.7' '22' '21.3' '22' '25.4' '21.3' '16.1' '14.9' '21.1' '24.1']
In [14]:
# Convert the string values to 64-bit floats so they can be compared numerically.
temp = temp.astype(np.float64)
print('data - ', temp[0:10])
data - [20.7 22. 21.3 22. 25.4 21.3 16.1 14.9 21.1 24.1]
In [35]:
# Quiz
# Which record has the highest maximum temperature?
hottest = np.argmax(temp)
print(rawData[hottest])
maxIdx = np.argmax(rawData[:, 4].astype(float))
print(rawData[maxIdx])
print('max - ', np.max(temp))
print('argmax - ', hottest)
# argsort orders ascending, so the largest value sits at the end.
order = np.argsort(temp)
print('argsort - ', order[-1])
print('argsort - ', order[::-1][0])
print()
print('answer - ', rawData[order[::-1][0], :])
['2018-08-01' '108' '33.6' '27.8' '39.6'] ['2018-08-01' '108' '33.6' '27.8' '39.6'] max - 39.6 argmax - 39293 argsort - 39293 argsort - 39293 answer - ['2018-08-01' '108' '33.6' '27.8' '39.6']
In [48]:
# Quiz
# Which record has the lowest value in column 2 (the mean temperature)?
temp = rawData[:, 2]
coldest = np.argmin(temp.astype(float))
print('answer - ', rawData[coldest, :])
answer - ['1915-01-13' '108' '-19.2' '-21.3' '-16.3']
- 벡터와 행렬을 연산할 수 있을까?
- 가능하다
- 왜? 브로드 캐스팅 때문에 가능
In [49]:
# Two equally sized integer ranges used for the loop-vs-vectorized comparison.
ary01 = np.arange(1,100001)
ary02 = np.arange(100001, 200001)
# Report the length of each array (the original printed len(ary02) twice).
print('len - ', len(ary01), len(ary02))
len - 100000 100000
In [51]:
# Zero-filled result buffer with the same shape and dtype as ary01.
tempAry = np.zeros(ary01.shape, dtype = ary01.dtype)
print(tempAry, len(tempAry))
[0 0 0 ... 0 0 0] 100000
In [52]:
%%time
for idx in range(len(ary02)) :
tempAry[idx] = ary01[idx] + ary02[idx]
print()
print('answer - ', tempAry)
answer - [100002 100004 100006 ... 299996 299998 300000] CPU times: total: 31.2 ms Wall time: 27.9 ms
In [53]:
%%time
tempAry = ary01 + ary02
print()
print('answer - ', tempAry)
answer - [100002 100004 100006 ... 299996 299998 300000] CPU times: total: 0 ns Wall time: 895 μs
In [54]:
# A scalar is broadcast across every element of the vector.
ary01 = np.arange(0, 3)
print(ary01)
print(3 * ary01)
[0 1 2] [0 3 6]
In [55]:
# 3 x 4 matrix holding 0..11 in row-major order.
ary02 = np.arange(12).reshape(3, 4)
aryInfo(ary02)
type - <class 'numpy.ndarray'> shape - (3, 4) dim - 2 dtype - int64 data - [[ 0 1 2 3] [ 4 5 6 7] [ 8 9 10 11]]
In [63]:
# ary01[:, np.newaxis] turns the length-3 vector into a (3, 1) column,
# which then broadcasts across the four columns of the (3, 4) ary02.
ary01[ : , np.newaxis] + ary02
Out[63]:
array([[ 0, 1, 2, 3],
[ 5, 6, 7, 8],
[10, 11, 12, 13]])
- np.newaxis : 배열에 새로운 축(axis)을 추가해서 차원을 늘리는 역할
In [56]:
# 1-D array of four integers used to demonstrate np.newaxis below.
ary = np.array([1,2,3,4])
aryInfo(ary)
type - <class 'numpy.ndarray'> shape - (4,) dim - 1 dtype - int64 data - [1 2 3 4]
In [57]:
# np.newaxis in the second position adds a trailing axis: (4,) -> (4, 1).
newAry = ary[ : , np.newaxis]
aryInfo(newAry)
type - <class 'numpy.ndarray'> shape - (4, 1) dim - 2 dtype - int64 data - [[1] [2] [3] [4]]
In [58]:
# np.newaxis in the first position adds a leading axis: (4,) -> (1, 4).
newAry = ary[ np.newaxis, :]
aryInfo(newAry)
type - <class 'numpy.ndarray'> shape - (1, 4) dim - 2 dtype - int64 data - [[1 2 3 4]]
In [64]:
'''
series(index + value)
index : 정수, 문자, 날짜, 시간 가능 (중복 라벨도 허용되지만, 고유한 인덱스를 권장)
'''
Out[64]:
'\nseries(index + value)\nindex : 정수, 문자, 날짜, 시간 가능 (중복 라벨도 허용되지만, 고유한 인덱스를 권장)\n'
In [67]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json as j
# Suppress warnings for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides pandas deprecation warnings —
# consider a narrower filter.
import warnings
warnings.filterwarnings('ignore')
# Record the library versions this notebook was run with.
print('numpy version - ', np.__version__)
print('pandas version - ', pd.__version__)
numpy version - 2.1.3 pandas version - 2.2.3
In [75]:
# Same five integers as a list, then an ndarray, then a Series.
lst = list(range(1, 6))
print('type - ', type(lst), end = '\n\n')
ary = np.asarray(lst)
print('type - ', type(ary), end = '\n\n')
series = pd.Series(ary)
print('type - ', type(series), end = '\n\n')
print(series)
# A Series is an index paired with an ndarray of values.
print('index - ', series.index)
print('values - ', series.values , type(series.values))
print('dtype - ', series.dtype)
type - <class 'list'> type - <class 'numpy.ndarray'> type - <class 'pandas.core.series.Series'> 0 1 1 2 2 3 3 4 4 5 dtype: int64 index - RangeIndex(start=0, stop=5, step=1) values - [1 2 3 4 5] <class 'numpy.ndarray'> dtype - int64
In [164]:
def aryInfo(ary) :
    """Print a short report about a NumPy array.

    Writes the object's type, shape, number of dimensions and dtype to
    stdout, followed by a blank line and the array contents.

    Args:
        ary: Object exposing ``shape``, ``ndim`` and ``dtype`` attributes
            (a ``numpy.ndarray`` in this notebook).
    """
    print('type - ', type(ary))
    print('shape - ', ary.shape)
    print('dim - ', ary.ndim)
    print('dtype - ', ary.dtype)
    print()
    print('data - ')
    print(ary)
def seriesInfo(s) :
    """Print a short report about a pandas Series.

    Writes the object's type, index, underlying values and dtype to stdout,
    followed by a blank line and the Series itself.

    Args:
        s: Object exposing ``index``, ``values`` and ``dtype`` attributes
            (a ``pandas.Series`` in this notebook).
    """
    print('type - ', type(s))
    print('index - ', s.index)
    print('values - ', s.values)
    print('dtype - ', s.dtype)
    print()
    print('data - ')
    print(s)
def frmInfo(frm):
    """Print a short report about a pandas DataFrame.

    Writes the object's type, shape, number of dimensions, row index,
    column index and underlying value array to stdout, followed by the
    frame itself.

    Args:
        frm: Object exposing ``shape``, ``ndim``, ``index``, ``columns`` and
            ``values`` attributes (a ``pandas.DataFrame`` in this notebook).
    """
    print('type - ', type(frm))
    print('shape - ', frm.shape)
    print('ndim - ', frm.ndim)
    print('row idx - ', frm.index, type(frm.index))
    print('col idx - ', frm.columns, type(frm.columns))
    print('values - ', type(frm.values) )
    print(frm.values)
    print('data - ')
    print(frm)
In [77]:
# Build a Series whose index labels are strings.
series = pd.Series([1, 2, 3], index = ['idx01', 'idx02', 'idx03'])
seriesInfo(series)
type - <class 'pandas.core.series.Series'> index - Index(['idx01', 'idx02', 'idx03'], dtype='object') values - [1 2 3] dtype - int64 data - idx01 1 idx02 2 idx03 3 dtype: int64
In [95]:
# series = pd.Series(data = [1,2,3,4,5],
#                    index = ['서초', '송파', '강남', '삼성', '중구'] )
# seriesInfo(series)
# Mixed-type Series holding one user's profile, labelled by field name.
series = pd.Series(data = ['임섭순','2025-11-04','Male',True],
                   index = ['이름', '생년월일', '성별', '결혼여부'] )
series.name = '사용자 정보'
series.index.name = '신상 정보'
seriesInfo(series)
print()
# Positional access goes through .iloc: integer keys passed to [] on a
# label-based index rely on a deprecated positional fallback.
print('index - ', series.iloc[0], series['이름'])
print('multi indexing - ', series.iloc[[0,2]], type(series.iloc[[0,2]]))
print(series[['이름', '성별']], type(series[['이름', '성별']]))
print()
# Positional slices exclude the stop position; label slices include the stop label.
print('slicing - ', series.iloc[0:3], series['이름':'성별'])
type - <class 'pandas.core.series.Series'> index - Index(['이름', '생년월일', '성별', '결혼여부'], dtype='object', name='신상 정보') values - ['임섭순' '2025-11-04' 'Male' True] dtype - object data - 신상 정보 이름 임섭순 생년월일 2025-11-04 성별 Male 결혼여부 True Name: 사용자 정보, dtype: object index - 임섭순 임섭순 multi indexing - 신상 정보 이름 임섭순 성별 Male Name: 사용자 정보, dtype: object <class 'pandas.core.series.Series'> 신상 정보 이름 임섭순 성별 Male Name: 사용자 정보, dtype: object <class 'pandas.core.series.Series'> slicing - 신상 정보 이름 임섭순 생년월일 2025-11-04 성별 Male Name: 사용자 정보, dtype: object 신상 정보 이름 임섭순 생년월일 2025-11-04 성별 Male Name: 사용자 정보, dtype: object
In [96]:
# List every attribute name available on the Series object.
print('dir - ', dir(series))
dir - ['T', '_AXIS_LEN', '_AXIS_ORDERS', '_AXIS_TO_AXIS_NUMBER', '_HANDLED_TYPES', '__abs__', '__add__', '__and__', '__annotations__', '__array__', '__array_priority__', '__array_ufunc__', '__bool__', '__class__', '__column_consortium_standard__', '__contains__', '__copy__', '__deepcopy__', '__delattr__', '__delitem__', '__dict__', '__dir__', '__divmod__', '__doc__', '__eq__', '__finalize__', '__firstlineno__', '__float__', '__floordiv__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__iadd__', '__iand__', '__ifloordiv__', '__imod__', '__imul__', '__init__', '__init_subclass__', '__int__', '__invert__', '__ior__', '__ipow__', '__isub__', '__iter__', '__itruediv__', '__ixor__', '__le__', '__len__', '__lt__', '__matmul__', '__mod__', '__module__', '__mul__', '__ne__', '__neg__', '__new__', '__nonzero__', '__or__', '__pandas_priority__', '__pos__', '__pow__', '__radd__', '__rand__', '__rdivmod__', '__reduce__', '__reduce_ex__', '__repr__', '__rfloordiv__', '__rmatmul__', '__rmod__', '__rmul__', '__ror__', '__round__', '__rpow__', '__rsub__', '__rtruediv__', '__rxor__', '__setattr__', '__setitem__', '__setstate__', '__sizeof__', '__static_attributes__', '__str__', '__sub__', '__subclasshook__', '__truediv__', '__weakref__', '__xor__', '_accessors', '_accum_func', '_agg_examples_doc', '_agg_see_also_doc', '_align_for_op', '_align_frame', '_align_series', '_append', '_arith_method', '_as_manager', '_attrs', '_binop', '_can_hold_na', '_check_inplace_and_allows_duplicate_labels', '_check_is_chained_assignment_possible', '_check_label_or_level_ambiguity', '_check_setitem_copy', '_clear_item_cache', '_clip_with_one_bound', '_clip_with_scalar', '_cmp_method', '_consolidate', '_consolidate_inplace', '_construct_axes_dict', '_construct_result', '_constructor', '_constructor_expanddim', '_constructor_expanddim_from_mgr', '_constructor_from_mgr', '_data', '_deprecate_downcast', '_dir_additions', '_dir_deletions', 
'_drop_axis', '_drop_labels_or_levels', '_duplicated', '_find_valid_index', '_flags', '_flex_method', '_from_mgr', '_get_axis', '_get_axis_name', '_get_axis_number', '_get_axis_resolvers', '_get_block_manager_axis', '_get_bool_data', '_get_cacher', '_get_cleaned_column_resolvers', '_get_index_resolvers', '_get_label_or_level_values', '_get_numeric_data', '_get_rows_with_mask', '_get_value', '_get_values_tuple', '_get_with', '_getitem_slice', '_gotitem', '_hidden_attrs', '_indexed_same', '_info_axis', '_info_axis_name', '_info_axis_number', '_init_dict', '_init_mgr', '_inplace_method', '_internal_names', '_internal_names_set', '_is_cached', '_is_copy', '_is_label_or_level_reference', '_is_label_reference', '_is_level_reference', '_is_mixed_type', '_is_view', '_is_view_after_cow_rules', '_item_cache', '_ixs', '_logical_func', '_logical_method', '_map_values', '_maybe_update_cacher', '_memory_usage', '_metadata', '_mgr', '_min_count_stat_function', '_name', '_needs_reindex_multi', '_pad_or_backfill', '_protect_consolidate', '_reduce', '_references', '_reindex_axes', '_reindex_indexer', '_reindex_multi', '_reindex_with_indexers', '_rename', '_replace_single', '_repr_data_resource_', '_repr_latex_', '_reset_cache', '_reset_cacher', '_set_as_cached', '_set_axis', '_set_axis_name', '_set_axis_nocheck', '_set_is_copy', '_set_labels', '_set_name', '_set_value', '_set_values', '_set_with', '_set_with_engine', '_shift_with_freq', '_slice', '_stat_function', '_stat_function_ddof', '_take_with_is_copy', '_to_latex_via_styler', '_typ', '_update_inplace', '_validate_dtype', '_values', '_where', 'abs', 'add', 'add_prefix', 'add_suffix', 'agg', 'aggregate', 'align', 'all', 'any', 'apply', 'argmax', 'argmin', 'argsort', 'array', 'asfreq', 'asof', 'astype', 'at', 'at_time', 'attrs', 'autocorr', 'axes', 'backfill', 'between', 'between_time', 'bfill', 'bool', 'case_when', 'clip', 'combine', 'combine_first', 'compare', 'convert_dtypes', 'copy', 'corr', 'count', 'cov', 'cummax', 
'cummin', 'cumprod', 'cumsum', 'describe', 'diff', 'div', 'divide', 'divmod', 'dot', 'drop', 'drop_duplicates', 'droplevel', 'dropna', 'dtype', 'dtypes', 'duplicated', 'empty', 'eq', 'equals', 'ewm', 'expanding', 'explode', 'factorize', 'ffill', 'fillna', 'filter', 'first', 'first_valid_index', 'flags', 'floordiv', 'ge', 'get', 'groupby', 'gt', 'hasnans', 'head', 'hist', 'iat', 'idxmax', 'idxmin', 'iloc', 'index', 'infer_objects', 'info', 'interpolate', 'is_monotonic_decreasing', 'is_monotonic_increasing', 'is_unique', 'isin', 'isna', 'isnull', 'item', 'items', 'keys', 'kurt', 'kurtosis', 'last', 'last_valid_index', 'le', 'list', 'loc', 'lt', 'map', 'mask', 'max', 'mean', 'median', 'memory_usage', 'min', 'mod', 'mode', 'mul', 'multiply', 'name', 'nbytes', 'ndim', 'ne', 'nlargest', 'notna', 'notnull', 'nsmallest', 'nunique', 'pad', 'pct_change', 'pipe', 'plot', 'pop', 'pow', 'prod', 'product', 'quantile', 'radd', 'rank', 'ravel', 'rdiv', 'rdivmod', 'reindex', 'reindex_like', 'rename', 'rename_axis', 'reorder_levels', 'repeat', 'replace', 'resample', 'reset_index', 'rfloordiv', 'rmod', 'rmul', 'rolling', 'round', 'rpow', 'rsub', 'rtruediv', 'sample', 'searchsorted', 'sem', 'set_axis', 'set_flags', 'shape', 'shift', 'size', 'skew', 'sort_index', 'sort_values', 'squeeze', 'std', 'str', 'struct', 'sub', 'subtract', 'sum', 'swapaxes', 'swaplevel', 'tail', 'take', 'to_clipboard', 'to_csv', 'to_dict', 'to_excel', 'to_frame', 'to_hdf', 'to_json', 'to_latex', 'to_list', 'to_markdown', 'to_numpy', 'to_period', 'to_pickle', 'to_sql', 'to_string', 'to_timestamp', 'to_xarray', 'transform', 'transpose', 'truediv', 'truncate', 'tz_convert', 'tz_localize', 'unique', 'unstack', 'update', 'value_counts', 'values', 'var', 'view', 'where', 'xs', '결혼여부', '생년월일', '성별', '이름']
In [98]:
# Materialize the index labels and the values as plain Python lists.
keyLst = list(series.keys())
print('key list - ', keyLst)
print()
valueLst = list(series.values)
print('value list - ', valueLst)
key list - ['이름', '생년월일', '성별', '결혼여부'] value list - ['임섭순', '2025-11-04', 'Male', True]
In [99]:
# Series over the integers 10..20 with the default RangeIndex.
series = pd.Series(range(10,21))
seriesInfo(series)
type - <class 'pandas.core.series.Series'> index - RangeIndex(start=0, stop=11, step=1) values - [10 11 12 13 14 15 16 17 18 19 20] dtype - int64 data - 0 10 1 11 2 12 3 13 4 14 5 15 6 16 7 17 8 18 9 19 10 20 dtype: int64
In [103]:
# Arithmetic applies element-wise to a Series and to its ndarray alike.
print(series * 2)
doubled = series.values * 2
print(doubled , type(doubled))
print('boolean indexing - ')
# A boolean mask selects only the positions where it is True.
evenMask = series.values % 2 == 0
print(evenMask)
print(series.values[evenMask])
0 20 1 22 2 24 3 26 4 28 5 30 6 32 7 34 8 36 9 38 10 40 dtype: int64 [20 22 24 26 28 30 32 34 36 38 40] <class 'numpy.ndarray'> boolean indexing - [ True False True False True False True False True False True] [10 12 14 16 18 20]
In [104]:
from datetime import date, datetime, timedelta
In [105]:
# Fixed reference date used by the following cells.
today = date(year = 2025, month = 11, day = 4)
print(today)
2025-11-04
In [118]:
# Generate ten consecutive dates starting today and use them as the Series index.
today = date(2025, 11, 4)
idx = [today + timedelta(days=i) for i in range(10)]
# The list built above is `idx`; the original passed an undefined name `dates`.
s = pd.Series(range(10), index=idx)
print(s)
2025-11-04 0 2025-11-05 1 2025-11-06 2 2025-11-07 3 2025-11-08 4 2025-11-09 5 2025-11-10 6 2025-11-11 7 2025-11-12 8 2025-11-13 9 dtype: int64
In [119]:
# Ten daily timestamps beginning at `today`.
date_index = pd.date_range(today, periods = 10, freq = 'D')
print(date_index)
DatetimeIndex(['2025-11-04', '2025-11-05', '2025-11-06', '2025-11-07',
'2025-11-08', '2025-11-09', '2025-11-10', '2025-11-11',
'2025-11-12', '2025-11-13'],
dtype='datetime64[ns]', freq='D')
In [122]:
# Ten random integers (one np.random.randint(1, 100) draw per entry),
# labelled with the dates in date_index.
series = pd.Series(data = [np.random.randint(1,100) for _ in range(10)],
index = date_index)
print(series)
2025-11-04 57 2025-11-05 85 2025-11-06 43 2025-11-07 85 2025-11-08 44 2025-11-09 33 2025-11-10 83 2025-11-11 48 2025-11-12 64 2025-11-13 17 Freq: D, dtype: int64
In [123]:
# Look up a single value by its date label.
print(series['2025-11-04'])
57
In [128]:
# Missing values (null): isnull(), notnull()
# Mark one date as missing.
series['2025-11-10'] = np.nan
print(series)
2025-11-04 57.0 2025-11-05 85.0 2025-11-06 43.0 2025-11-07 85.0 2025-11-08 44.0 2025-11-09 33.0 2025-11-10 NaN 2025-11-11 48.0 2025-11-12 64.0 2025-11-13 17.0 Freq: D, dtype: float64
In [131]:
# A bare expression is only displayed when it is the last statement of a cell,
# so the isnull() mask must be printed explicitly to be seen.
print(pd.isnull(series))
print()
# Left as the final expression so the notebook still shows it as the cell output.
pd.notnull(series)
Out[131]:
2025-11-04 True 2025-11-05 True 2025-11-06 True 2025-11-07 True 2025-11-08 True 2025-11-09 True 2025-11-10 False 2025-11-11 True 2025-11-12 True 2025-11-13 True Freq: D, dtype: bool
In [136]:
# Missing values are commonly replaced with the mean or the median.
# Series.mean() skips NaN, so the fill value is the mean of the observed entries.
series[series.isna()] = series.mean()
print(series)
2025-11-04 57.000000 2025-11-05 85.000000 2025-11-06 43.000000 2025-11-07 85.000000 2025-11-08 44.000000 2025-11-09 33.000000 2025-11-10 52.888889 2025-11-11 48.000000 2025-11-12 64.000000 2025-11-13 17.000000 Freq: D, dtype: float64
In [137]:
# fillna(): fill missing values with a chosen value.
# Put the NaN back first so there is something to fill.
series['2025-11-10'] = np.nan
print(series)
2025-11-04 57.0 2025-11-05 85.0 2025-11-06 43.0 2025-11-07 85.0 2025-11-08 44.0 2025-11-09 33.0 2025-11-10 NaN 2025-11-11 48.0 2025-11-12 64.0 2025-11-13 17.0 Freq: D, dtype: float64
In [139]:
# Replace every NaN with 0; fillna returns a new Series, so rebind the name.
series = series.fillna(0)
print(series)
2025-11-04 57.0 2025-11-05 85.0 2025-11-06 43.0 2025-11-07 85.0 2025-11-08 44.0 2025-11-09 33.0 2025-11-10 0.0 2025-11-11 48.0 2025-11-12 64.0 2025-11-13 17.0 Freq: D, dtype: float64
In [140]:
# subset
# Blank out the 4th and 9th entries by position. .iloc makes the positional
# intent explicit; integer keys passed to [] on a DatetimeIndex rely on a
# deprecated positional fallback.
series.iloc[3] = np.nan
series.iloc[8] = np.nan
print(series)
2025-11-04 57.0 2025-11-05 85.0 2025-11-06 43.0 2025-11-07 NaN 2025-11-08 44.0 2025-11-09 33.0 2025-11-10 0.0 2025-11-11 48.0 2025-11-12 NaN 2025-11-13 17.0 Freq: D, dtype: float64
In [144]:
# Quiz
# Build a subset of the original data that excludes the missing values.
subset = series[series.notna()]
print(subset)
2025-11-04 57.0 2025-11-05 85.0 2025-11-06 43.0 2025-11-08 44.0 2025-11-09 33.0 2025-11-10 0.0 2025-11-11 48.0 2025-11-13 17.0 dtype: float64
In [146]:
'''
DataFrame(표 형식 - 행, 열) : .csv 을 읽어서 표로
- series는 인덱스만 있었지만
- 행 인덱스, 열 인덱스가 있다
- pd.DataFrame(data= ,columns= ,index= )
- dict, [[]] ,
'''
Out[146]:
'\nDataFrame(표 형식 - 행, 열) : .csv 을 읽어서 표로\n- series는 인덱스만 있었지만\n- 행 인덱스, 열 인덱스가 있다\n- pd.DataFrame(data= ,columns= ,index= )\n- dict, [[]] , \n'
In [157]:
# Dict keys become column labels; every column holds the same three values.
frm = pd.DataFrame({f'feature0{n}' : [1,2,3] for n in (1, 2, 3)})
print(frm)
# Basic structural facts about the frame.
for label, shown in (('type - ', type(frm)), ('shape - ', frm.shape), ('ndim - ', frm.ndim)):
    print(label, shown)
# Row and column indexes, each together with its class.
for label, axis in (('row idx - ', frm.index), ('col idx - ', frm.columns)):
    print(label, axis, type(axis))
cells = frm.values
print('values - ', type(cells) )
print(cells)
feature01 feature02 feature03 0 1 1 1 1 2 2 2 2 3 3 3 type - <class 'pandas.core.frame.DataFrame'> shape - (3, 3) ndim - 2 row idx - RangeIndex(start=0, stop=3, step=1) <class 'pandas.core.indexes.range.RangeIndex'> col idx - Index(['feature01', 'feature02', 'feature03'], dtype='object') <class 'pandas.core.indexes.base.Index'> values - <class 'numpy.ndarray'> [[1 1 1] [2 2 2] [3 3 3]]
In [165]:
# Row-oriented construction with explicit column and row labels.
frm = pd.DataFrame(data = [[1, 2, 3] for _ in range(3)],
                   index = [f'user_{idx}' for idx in range(3)],
                   columns = ['A', 'B', 'C'])
frmInfo(frm)
type - <class 'pandas.core.frame.DataFrame'>
shape - (3, 3)
ndim - 2
row idx - Index(['user_0', 'user_1', 'user_2'], dtype='object') <class 'pandas.core.indexes.base.Index'>
col idx - Index(['A', 'B', 'C'], dtype='object') <class 'pandas.core.indexes.base.Index'>
values - <class 'numpy.ndarray'>
[[1 2 3]
[1 2 3]
[1 2 3]]
data -
A B C
user_0 1 2 3
user_1 1 2 3
user_2 1 2 3
In [176]:
print('전처리(pre-processing) - 열 이름 변경, 인덱스 변경, rename()')
# Relabel all three columns and one row label, modifying frm in place.
frm.rename(index = {'user_1' : 'customer01'},
           columns = {'A':'name', 'B':'gender', 'C':'isMarraige'},
           inplace = True)
전처리(pre-processing) - 열 이름 변경, 인덱스 변경, rename()
In [179]:
# Inspect the frame after the rename.
frmInfo(frm)
type - <class 'pandas.core.frame.DataFrame'>
shape - (3, 3)
ndim - 2
row idx - Index(['user_0', 'customer01', 'user_2'], dtype='object') <class 'pandas.core.indexes.base.Index'>
col idx - Index(['name', 'gender', 'isMarraige'], dtype='object') <class 'pandas.core.indexes.base.Index'>
values - <class 'numpy.ndarray'>
[[1 2 3]
[1 2 3]
[1 2 3]]
data -
name gender isMarraige
user_0 1 2 3
customer01 1 2 3
user_2 1 2 3
In [186]:
print('데이터 추출 - indexing')
# Selecting one column label returns that column as a Series.
nameCol = frm['name']
print('type - ', type(nameCol))
print(nameCol, type(nameCol))
print(nameCol.values, type(nameCol.values))
데이터 추출 - indexing type - <class 'pandas.core.series.Series'> user_0 1 customer01 1 user_2 1 Name: name, dtype: int64 <class 'pandas.core.series.Series'> [1 1 1] <class 'numpy.ndarray'>
In [203]:
print('데이터 추가')
# Assigning to a new column label appends that column to the frame.
frm['age'] = [10,20,30]
데이터 추가
In [204]:
# Show the frame with the new 'age' column.
print(frm)
name gender isMarraige age user_0 1 2 3 10 customer01 1 2 3 20 user_2 1 2 3 30
In [205]:
print('삭제 - 열')
# del removes the column from the frame in place.
del frm['age']
삭제 - 열
In [206]:
# Show the frame after the 'age' column was removed.
print(frm)
name gender isMarraige user_0 1 2 3 customer01 1 2 3 user_2 1 2 3
In [221]:
print('만약 , 행 인덱싱을 하고 싶다면? - 정답 : 슬라이싱(정수, 문자)')
print(frm)
print()
# A slice inside [] selects rows; an integer slice is positional and excludes the stop.
print(frm[0:1])
print()
# A label slice selects rows up to and including the stop label.
print(frm[ : 'customer01'])
만약 , 행 인덱싱을 하고 싶다면? - 정답 : 슬라이싱(정수, 문자)
name gender isMarraige
user_0 1 2 3
customer01 1 2 3
user_2 1 2 3
name gender isMarraige
user_0 1 2 3
name gender isMarraige
user_0 1 2 3
customer01 1 2 3
In [226]:
# 'name' column for the rows up to and including 'customer01', in one .loc lookup.
print(frm.loc[:'customer01', 'name'])
user_0 1 customer01 1 Name: name, dtype: int64
- json 데이터를 분석하기 위한 판다스 프레임으로
In [228]:
import urllib.request
import json
In [236]:
# sample json api
endPoint = 'https://jsonplaceholder.typicode.com/posts'
# The context manager closes the HTTP response once the body has been read;
# the original left the connection open.
with urllib.request.urlopen(endPoint) as response:
    print('response - ')
    # json parsing
    result = json.loads(response.read())
print('type - ', type(result))
print('keys - ', result[0].keys())
response - type - <class 'list'> keys - dict_keys(['userId', 'id', 'title', 'body'])
In [237]:
# Build a DataFrame from the parsed list of post dicts (dict keys become columns).
frm = pd.DataFrame(result)
In [269]:
#frmInfo(frm)
In [259]:
# Quiz
# https://dummyjson.com/carts
# Load the JSON from this endpoint and collect, for every cart, its userId,
# total and discountedTotal together with the title, price and quantity of
# each contained product, producing one DataFrame row per (cart, product).
import urllib.request
import json
endPoint = 'https://dummyjson.com/carts'
# Close the HTTP response as soon as the body has been parsed.
with urllib.request.urlopen(endPoint) as response:
    result = json.loads(response.read())
carts = result['carts']
# Cart-level fields are repeated on each of that cart's product rows.
rows = [
    {
        "userId" : cart["userId"],
        "total" : cart["total"],
        "discountedTotal" : cart["discountedTotal"],
        "title" : prod["title"],
        "price" : prod["price"],
        "quantity" : prod["quantity"],
    }
    for cart in carts
    for prod in cart['products']
]
frm = pd.DataFrame(rows)
print(frm.head())
userId total discountedTotal title \
0 33 103774.85 89686.65 Charger SXT RWD
1 33 103774.85 89686.65 Apple MacBook Pro 14 Inch Space Grey
2 33 103774.85 89686.65 Green Oval Earring
3 33 103774.85 89686.65 Apple Airpods
4 142 4794.80 4288.95 Cricket Helmet
price quantity
0 32999.99 3
1 1999.99 2
2 24.99 5
3 129.99 5
4 44.99 4
In [252]:
# Fetch the carts and flatten them to one row per (cart, product).
endPoint = 'https://dummyjson.com/carts'
# Close the HTTP response as soon as the body has been parsed.
with urllib.request.urlopen(endPoint) as response:
    result = json.loads(response.read())
carts = result['carts']
rows = []
for cart in carts:
    # Cart-level fields are read once and copied onto every product row.
    userId = cart['userId']
    total = cart['total']
    discountedTotal = cart['discountedTotal']
    for p in cart['products']:
        rows.append({
            'userId': userId,
            'total': total,
            'discountedTotal': discountedTotal,
            'title': p['title'],
            'price': p['price'],
            'quantity': p['quantity']
        })
frm = pd.DataFrame(rows)
print(frm)
userId total discountedTotal title \
0 33 103774.85 89686.65 Charger SXT RWD
1 33 103774.85 89686.65 Apple MacBook Pro 14 Inch Space Grey
2 33 103774.85 89686.65 Green Oval Earring
3 33 103774.85 89686.65 Apple Airpods
4 142 4794.80 4288.95 Cricket Helmet
.. ... ... ... ...
116 170 3862.43 3488.44 Volleyball
117 177 128249.07 118740.76 Marni Red & Black Suit
118 177 128249.07 118740.76 Pacifica Touring
119 177 128249.07 118740.76 Potatoes
120 177 128249.07 118740.76 Plant Pot
price quantity
0 32999.99 3
1 1999.99 2
2 24.99 5
3 129.99 5
4 44.99 4
.. ... ...
116 11.99 5
117 179.99 1
118 31999.99 4
119 2.29 4
120 14.99 4
[121 rows x 6 columns]
In [261]:
# Statistical analysis / exploratory data analysis
# Mean price per product title (frm has no brand column), highest first, top five.
frm.groupby('title')['price'].mean().sort_values(ascending=False).head()
Out[261]:
title Charger SXT RWD 32999.99 Pacifica Touring 31999.99 300 Touring 28999.99 Rolex Cellini Moonphase 15999.99 Rolex Submariner Watch 13999.99 Name: price, dtype: float64
In [264]:
# Total purchase amount per user (top five).
# frm holds one row per product, so a cart's `total` is repeated on every
# product row of that cart; summing it directly would multiply each cart total
# by its product count. Collapse to one row per cart first.
# NOTE(review): frm carries no cart id, so two carts of one user with identical
# totals would be merged here — confirm this cannot occur in the data.
users = (
    frm.drop_duplicates(subset=['userId', 'total', 'discountedTotal'])
       .groupby('userId')['total']
       .sum()
       .sort_values(ascending=False)
       .head()
)
In [265]:
import matplotlib.pyplot as plt
# Bar chart of the per-user totals computed above.
users.plot.bar(figsize=(10,4), title="user purchase (USD)")
plt.show()
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: