'''
변수(variable) : 데이터를 담는 그릇 (숫자로 시작 불가, 특수문자 _ , $ 허용됨)
- 예약어는 변수로 사용할 수 없다.

Python Built-In Types
- Numeric (int, float)
- Sequence (list , tuple)
- Text Sequence (str)
- Set (set)
- Mapping (dict)
- Bool

다양한 변수 선언 방식
- Camel  Case : numberOfColleageGraduates - 변수, 함수
- Pascal Case : NumberOfColleageGraduates - 클래스
- Snake  Case : number_of_colleage_graduates - 비추천
'''

# 예약어 확인

import keyword
keywordList = keyword.kwlist

print('키워드 = ' , keywordList)
print()
print('type - ' , type(keywordList))

키워드 =  ['False', 'None', 'True', 'and', 'as', 'assert', 'async', 'await', 'break', 'class', 'continue', 'def', 'del', 'elif', 'else', 'except', 'finally', 'for', 'from', 'global', 'if', 'import', 'in', 'is', 'lambda', 'nonlocal', 'not', 'or', 'pass', 'raise', 'return', 'try', 'while', 'with', 'yield']

type -  <class 'list'>

# 기본 변수 바인딩

year = "2025"
month = 10

# 함수 호출
print(year, month)

# 변수타입 확인을 위한 함수 - type()
print('year type - ', type(year))

2025 10
year type -  <class 'str'>

# 문자열
name = "임정섭"
print(name, type(name))
print('010' , '1234' , '5678' , sep = "-")
print('jslim9413', 'naver.com' , sep="@")

임정섭 <class 'str'>
010-1234-5678
jslim9413@naver.com

# Sequence - List type
# [] - 순서가 있는 열거형 (파이썬에서는 Array가 아님, 파이썬에서는 배열이 존재하지 않음)
# indexing, slicing을 포함함

print('list     - ' , keywordList)
print()
print('indexing - ' , keywordList[0])
print()
print('slicing  - ' , keywordList[0:5])

list     -  ['False', 'None', 'True', 'and', 'as', 'assert', 'async', 'await', 'break', 'class', 'continue', 'def', 'del', 'elif', 'else', 'except', 'finally', 'for', 'from', 'global', 'if', 'import', 'in', 'is', 'lambda', 'nonlocal', 'not', 'or', 'pass', 'raise', 'return', 'try', 'while', 'with', 'yield']

indexing -  False

slicing  -  ['False', 'None', 'True', 'and', 'as']

#text sequence

print(name)
print('indexing - ' , name[0] , type(name[0]))
print('indexing - ' , name[-1])
print('slicing  - ' , name[0:2])

임정섭
indexing -  임 <class 'str'>
indexing -  섭
slicing  -  임정

# Mapping(key, value)
# dict , {}
# 인덱싱이 불가하고 key를 통해 데이터에 접근
log = {
    "SQL Injection" : {
        "정의" : "사용자 입력을 적절히 검증하지않고 데이터베이스 쿼리에 직접 포함하는 것",
        "용어" : [
            "입력검증", "파라미터화", "블라인드", "에러기반"
        ],
        "취약코드 예시" : "query = select * from table",
        "공격예시" : "user = admin or 1=1",
        "탐지" : [
            "웹 취약점 스캐너 사용", "비정상적인 쿼리", "오류 모니터링"
        ]
    }
}
print(log)
print()
print('type - ' , type(log))
print()
print(log["SQL Injection"]["용어"], type(log["SQL Injection"]["용어"]))
print()

# --error
# print(log[0][1], type(log[0][1]))

{'SQL Injection': {'정의': '사용자 입력을 적절히 검증하지않고 데이터베이스 쿼리에 직접 포함하는 것', '용어': ['입력검증', '파라미터화', '블라인드', '에러기반'], '취약코드 예시': 'query = select * from table', '공격예시': 'user = admin or 1=1', '탐지': ['웹 취약점 스캐너 사용', '비정상적인 쿼리', '오류 모니터링']}}

type -  <class 'dict'>

['입력검증', '파라미터화', '블라인드', '에러기반'] <class 'list'>

# 집합 (tuple)
# () 소괄호 사용
# indexing, slicing 가능
# 불변성(immutable)을 가진다 

tupleTemp = (1)
print('type - ', type(tupleTemp)) #type -  <class 'int'> int가 나온다 요소가 한개여서 정수로 봄

tupleTemp = (1,)
print('type - ', type(tupleTemp)) # ,를 사용해서 집합으로 만들어줘야한다

tupleTemp = "a", "b", "c", "d", "e"
print('data - ', tupleTemp)
print('type - ', type(tupleTemp)) # 소괄호에 대한 생략도 가능하다 (packing)

a, b, c, *d = tupleTemp
print('unpacking - ', a, b, c)

print()
print('indexing - ', tupleTemp[0])
print('slicing - ', tupleTemp[0:]) # start index : end index : step
print('slicing - ', tupleTemp[0::2])

# --error
# 불변성으로 데이터를 변경할 수 없다
# tupleTemp[0] = 23

type -  <class 'int'>
type -  <class 'tuple'>
data -  ('a', 'b', 'c', 'd', 'e')
type -  <class 'tuple'>
unpacking -  a b c

indexing -  a
slicing -  ('a', 'b', 'c', 'd', 'e')
slicing -  ('a', 'c', 'e')

# Text Sequence
# 문자열
# 인덱싱 슬라이싱 가능
strTemp = 'Talk is cheap. show me the code'
print(strTemp, type(strTemp))
print()
print('indexing - ', strTemp[0])

# --error
# strTemp[0] = 'L'

Talk is cheap. show me the code <class 'str'>

indexing -  T

print('Quiz - 마지막 문자열(code) 추출한다면 ?')
print(strTemp[-4:])

listTemp = strTemp.split() # 공백을 기준으로 나눔
print(listTemp)
print(type(listTemp))
print('result - ', listTemp[-1])

Quiz - 마지막 문자열(code) 추출한다면 ?
code
['Talk', 'is', 'cheap.', 'show', 'me', 'the', 'code']
<class 'list'>
result -  code

strEx = '홀짝홀짝홀짝홀짝홀짝홀짝홀짝홀짝홀짝홀짝'

print('Quiz - 주어진 변수에 홀 만 추출하여 출력하고 싶다면 ?')

print('result 홀 - ', strEx[::2])
print('result 짝 - ', strEx[1::2])

Quiz - 주어진 변수에 홀 만 추출하여 출력하고 싶다면 ?
result 홀 -  홀홀홀홀홀홀홀홀홀홀
result 짝 -  짝짝짝짝짝짝짝짝짝짝

# 집합(set) : 연산 (합집합, 교집합, 차집합, etc...)
# {}
# 순서가 보장되지 않는다.
# 중복을 보장하지 않는다.
# 인덱싱 슬라이싱 불가
# 변경가능
# {} 비어있는 중괄호는 dict로 인식 비어두고 싶다면 set()을 사용

userRoles = {"admin", "user", "user", "guest"}
print('data - ', userRoles)
print('type - ', type(userRoles))

data -  {'guest', 'admin', 'user'}
type -  <class 'set'>

empSet = {}
print('type - ', type(empSet))

empSet = set()
print(dir(empSet)) # magic method, 일반 method
print()
print('type - ', type(empSet))

# add method 데이터 추가
empSet.add("user")
empSet.add("admin")
empSet.add("guest")

print('data - ', empSet)

empSet.discard('guest')
print()
print('data - ', empSet)

type -  <class 'dict'>
['__and__', '__class__', '__class_getitem__', '__contains__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__iand__', '__init__', '__init_subclass__', '__ior__', '__isub__', '__iter__', '__ixor__', '__le__', '__len__', '__lt__', '__ne__', '__new__', '__or__', '__rand__', '__reduce__', '__reduce_ex__', '__repr__', '__ror__', '__rsub__', '__rxor__', '__setattr__', '__sizeof__', '__str__', '__sub__', '__subclasshook__', '__xor__', 'add', 'clear', 'copy', 'difference', 'difference_update', 'discard', 'intersection', 'intersection_update', 'isdisjoint', 'issubset', 'issuperset', 'pop', 'remove', 'symmetric_difference', 'symmetric_difference_update', 'union', 'update']

type -  <class 'set'>
data -  {'guest', 'admin', 'user'}

data -  {'admin', 'user'}

# 연산 (| , & , -, ^)
adminRoles = {"read", "write", "delete", "update"}
userRoles = {"read", "comment"}

print('type - ', type(adminRoles), type(userRoles))
print()
print('합집합 - ', adminRoles | userRoles)
print('교집합 - ', adminRoles & userRoles)
print('차집합 - ', adminRoles - userRoles)
print('대칭차집합 - ', adminRoles ^ userRoles)

type -  <class 'set'> <class 'set'>

합집합 -  {'comment', 'delete', 'update', 'write', 'read'}
교집합 -  {'read'}
차집합 -  {'write', 'delete', 'update'}
대칭차집합 -  {'comment', 'delete', 'update', 'write'}

# 보안관점에서 set() 활용 방안을 생각해 본다면?
# 중복로그인 감지 (토큰 또는 세션을 관리)

userTokens = set()

def login(token):
    if token in userTokens:
        raise ValueError(f"[보안경고] 이미 로그인 중 : {token}")
    userTokens.add(token)
    print(f"{token}로그인 성공")

login('jslim')
login('sk')
login('jslim')

jslim로그인 성공
sk로그인 성공

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[128], line 14
     12 login('jslim')
     13 login('sk')
---> 14 login('jslim')

Cell In[128], line 8, in login(token)
      6 def login(token):
      7     if token in userTokens:
----> 8         raise ValueError(f"[보안경고] 이미 로그인 중 : {token}")
      9     userTokens.add(token)
     10     print(f"{token}로그인 성공")

ValueError: [보안경고] 이미 로그인 중 : jslim

# 연산자(operator) : in , not in
fruits = ['apple', 'banana', 'durian', 'orange']
print('data - ', fruits)
print('type - ', type(fruits))
print('orange' in fruits)

data -  ['apple', 'banana', 'durian', 'orange']
type -  <class 'list'>
True

userInfo = {
    'name' : 'jslim',
    'age' : 20
}

print('name ', userInfo['name'])
print('age ', userInfo['age'])

name  jslim
age  20

role = "admin"
print('문자열 연결(+)', "Hello "+role+" !!")

문자열 연결(+) Hello admin !!

# f-string(3.6~)
# {} 안에 변수나 표현식을 넣을 수 있음
name = "sk"
score = 95.1234
print('data - ', score)
print('type - ', type(score))
print(f" name : {name} , score : {score:.2f}")

data -  95.1234
type -  <class 'float'>
 name : sk , score : 95.12

# str.format()
lang = 'python'
version = 3.13

print("language : {} , version : {} ".format(lang, version))
print("language : {0} , version : {1} ".format(lang, version))
print("language : {a} , version : {b} ".format(a=lang, b=version))

language : python , version : 3.13 
language : python , version : 3.13 
language : python , version : 3.13

# 서식 지정 연산자
# c 스타일 포맷 : %s, %d, %f, %.2f

# %.2fcm 단위 붙이기도 가능

print('language : %s , version : %.2fcm' % (lang, version))

language : python , version : 3.13cm

data = "Hello\nPython"
print(data)
print()
print(str(data))
print()
print(repr(data))

Hello
Python

Hello
Python

'Hello\nPython'

# 형변환 함수

strTemp = '100'
print('type - ', type(int(strTemp)))

type -  <class 'int'>

# 다중라인 출력

query = f"""select * 
        from table 
        where id = admin or 1 = 1"""
print(query)

msg = f"""
[user login Report]
===================
ID : sk
Time : 2025-10-27
STATUS : success
"""

print(msg)

select * 
        from table 
        where id = admin or 1 = 1

[user login Report]
===================
ID : sk
Time : 2025-10-27
STATUS : success

# 숫자 포맷
num = 1234567.8912

# Quiz : 천단위로 구분하고 싶다면
print(f"{num:,}")

# Quiz : 정렬
print(f"{num:>20.2f}") #오른쪽 정력 -> 20은 길이의 폭
print(f"{num:<20.2f}") #왼쪽 정렬
print(f"{num:020.2f}") #앞을 0으로 채워라

1,234,567.8912
          1234567.89
1234567.89          
00000000001234567.89

# 보안 관점에서 로그를 적용
# 이벤트로 수집된 데이터의 파싱을 용이하게 하기 위해서
# 탐지를 위한 규칙을 생성
# 변조된 데이터 식별을 쉽게하기 위함

loginUser = {
    'type' : 'guest',
    'ip'   : '192.168.0.10',
    'event': 'LOGIN_SUCCESS'
}

logMsg = f"[ALTER] User={loginUser["type"]} , IP = {loginUser["ip"]} , event = {loginUser["event"]}"
print(logMsg)

[ALTER] User=guest , IP = 192.168.0.10 , event = LOGIN_SUCCESS

logLst = []
print('type - ', type(logLst))
print()
print('dir - ', dir(logLst))
logLst.append(loginUser)
logLst.append(loginUser)
print()
print(logLst)

type -  <class 'list'>

dir -  ['__add__', '__class__', '__class_getitem__', '__contains__', '__delattr__', '__delitem__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__iadd__', '__imul__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__mul__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__reversed__', '__rmul__', '__setattr__', '__setitem__', '__sizeof__', '__str__', '__subclasshook__', 'append', 'clear', 'copy', 'count', 'extend', 'index', 'insert', 'pop', 'remove', 'reverse', 'sort']

[{'type': 'guest', 'ip': '192.168.0.10', 'event': 'LOGIN_SUCCESS'}, {'type': 'guest', 'ip': '192.168.0.10', 'event': 'LOGIN_SUCCESS'}]

logMsg = f"[ALTER] User={logLst[0]["type"]} , IP = {logLst[0]["ip"]} , event = {logLst[0]["event"]}"
print(logMsg)
logMsg = f"[ALTER] User={logLst[1]["type"]} , IP = {logLst[1]["ip"]} , event = {logLst[1]["event"]}"
print(logMsg)

[ALTER] User=guest , IP = 192.168.0.10 , event = LOGIN_SUCCESS
[ALTER] User=guest , IP = 192.168.0.10 , event = LOGIN_SUCCESS

# bool : 참(True) / 거짓(False)
# 조건문, 비교, 논리 연산자에서 사용됨
# semi bool : 1 / 0

isLoggedIn = True
hasPermission = False

print('castring - ', int(isLoggedIn), type(isLoggedIn))
print('castring - ', int(hasPermission))

print('Trusy, Falsy')
print('castring - ', bool(1))
print('castring - ', bool(0))
print('castring - ', bool('')) # '' 비어있으면 False
print('castring - ', bool('a')) # '' 들어있으면 True
print('castring - ', bool([]))
print('castring - ', bool([1,2,3]))

castring -  1 <class 'bool'>
castring -  0
Trusy, Falsy
castring -  True
castring -  False
castring -  False
castring -  True
castring -  False
castring -  True

url = 'http://www.naver.com'
print('type - ', type(url))

print('주어진 문자열에서 도메인만 추출하고 싶다면? ')
print('com - ', url[-3:])

print()
print('instance method - ', dir(url))
print()
print('find - ', url.find('com'))
print('com - ', url[url.find('com'):])

type -  <class 'str'>
주어진 문자열에서 도메인만 추출하고 싶다면? 
com -  com

instance method -  ['__add__', '__class__', '__contains__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getnewargs__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__mod__', '__mul__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__rmod__', '__rmul__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', 'capitalize', 'casefold', 'center', 'count', 'encode', 'endswith', 'expandtabs', 'find', 'format', 'format_map', 'index', 'isalnum', 'isalpha', 'isascii', 'isdecimal', 'isdigit', 'isidentifier', 'islower', 'isnumeric', 'isprintable', 'isspace', 'istitle', 'isupper', 'join', 'ljust', 'lower', 'lstrip', 'maketrans', 'partition', 'removeprefix', 'removesuffix', 'replace', 'rfind', 'rindex', 'rjust', 'rpartition', 'rsplit', 'rstrip', 'split', 'splitlines', 'startswith', 'strip', 'swapcase', 'title', 'translate', 'upper', 'zfill']

find -  17
com -  com

companyName = '   sk    '
print('type - ', type(companyName) , 'len - ', len(companyName))
print('strip - ', companyName.strip() , 'len - ', len(companyName.strip()))

type -  <class 'str'> len -  9
strip -  sk len -  2

companyName = 'samsung'
print('첫번째 문자를 대문자로 변경해서 출력 - ', companyName.capitalize())

첫번째 문자를 대문자로 변경해서 출력 -  Samsung

# Quiz
# 제공되는 문자열에서 파일의 확장자가 .xls 파일인지를 확인하고 싶다면?
fileName = 'report.xls'
print('flag - ', fileName.endswith(".xls"))
print('flag - ', fileName.rsplit('.', 1)[-1] == "xls")

flag -  True
flag -  True

# worker function
def printCoin():
    print('코인')

# caller
printCoin()

코인

def greet(name):
    return f"Hello~ , {name}"

result = greet('jslim')
print(result, type(result))

Hello~ , jslim <class 'str'>

lst = [1,2,3,4,5,6,7]
print('type - ', type(lst), bool(lst))
print()
print('max - ', max(lst))
print('min - ', min(lst))
print('sum - ', sum(lst))
print('mean - ', sum(lst) / len(lst) , type(sum(lst) / len(lst)))
print('mean - ', int(sum(lst) / len(lst)) , type(int(sum(lst) / len(lst)))) # 캐스팅 int를 지정

type -  <class 'list'> True

max -  7
min -  1
sum -  28
mean -  4.0 <class 'float'>
mean -  4 <class 'int'>

lstTmp01 = [1,2,3]
lstTmp02 = [1,2,3]

print('instance address - id() ', id(lstTmp01), id(lstTmp02))
print('is - 주소번지를 비교하는 연산자 ', lstTmp01 is lstTmp02)

lstTmp03 = lstTmp01
print('is - 주소번지를 비교하는 연산자 ', lstTmp01 is lstTmp03)

instance address - id()  2835138787328 2835153284480
is - 주소번지를 비교하는 연산자  False
is - 주소번지를 비교하는 연산자  True

from copy import copy, deepcopy

original = [[1,2],[3,4]]

shallowCopy = copy(original) # 얕은 복사
print('instance address - id() ', id(original[0]), id(shallowCopy[0]))
deepCopy = deepcopy(original) # 깊은 복사
print('instance address - id() ', id(original), id(deepCopy))

original[0][0] = 2
print('shallowCopy - ', shallowCopy) # 얕은 복사는 원본의 영향을 받는다
print('deepCopy    - ', deepCopy) # 깊은 복사는 원본의 영향을 받지 않는다

instance address - id()  2835158214592 2835158214592
instance address - id()  2835158209216 2835158135168
shallowCopy -  [[2, 2], [3, 4]]
deepCopy    -  [[1, 2], [3, 4]]

# 복사에 취약점
import copy
userData = {'id' : 1, 'token' : 'secret1234'}
cache = copy.copy(userData) # str 불변이라서 새로운 객체가 만들어짐

userData['token'] = 'secret4321'
print(userData['token'])
print(cache['token'])

secret4321
secret1234

import copy
userData = {'id' : 1, 'roles' : ['admin', 'user']}
cache = copy.copy(userData)

userData['roles'].append('guest') # list는 가변이기 때문에 객체를 그대로 사용 -> 원본의 변화에 영향을 받음
print(userData)
print(cache)

{'id': 1, 'roles': ['admin', 'user', 'guest']}
{'id': 1, 'roles': ['admin', 'user', 'guest']}

#rangeTmp = range(1, 11, 2)
#rangeTmp = range(11)
rangeTmp = range(1, 11)

print('data - ', rangeTmp, type(rangeTmp))
print('dir - ', dir(rangeTmp))

# for 변수 in 열거형 : (반복문)
for data in rangeTmp:
    print(data, end='\t')

data -  range(1, 11) <class 'range'>
dir -  ['__bool__', '__class__', '__contains__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__reversed__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', 'count', 'index', 'start', 'step', 'stop']
1	2	3	4	5	6	7	8	9	10

lst = [10, 20, 30]

for data in lst:
    print(data)

print()
for idx in range(len(lst)):
    print('idx - ', idx, 'data - ', lst[idx])

10
20
30

idx -  0 data -  10
idx -  1 data -  20
idx -  2 data -  30

#import random
from random import randint

# append() : 데이터를 0부터 추가
lst = []

for idx in range(10):
    lst.append(randint(1,5))
    
print('data - ', lst)
print()

# print('dir - ', dir(lst))
lst.sort()
print('sort - ', lst)
lst.sort(reverse = True)
print('sort - ', lst)

data -  [5, 5, 5, 4, 2, 1, 4, 4, 2, 4]

sort -  [1, 2, 2, 4, 4, 4, 4, 5, 5, 5]
sort -  [5, 5, 5, 4, 4, 4, 4, 2, 2, 1]

if 6 in lst:
    print('find - ')
else :
    print('not found')

not found

lst = [2,4,1,5,8,4]

for data in lst:
    print(data, end='\t')

print()
for idx in range(len(lst)):
    print(lst[idx], end='\t')

2	4	1	5	8	4	
2	4	1	5	8	4

# Quiz : 리스트에 있는 각각의 값을 제곱한 결과를 확인하고 싶다면?
# 연산자 : % (나머지)
for data in lst:
    print(data*data, end='\t')
print()
for idx in range(len(lst)):
    print(lst[idx]*lst[idx], end='\t')
print()

result = []
for idx in range(len(lst)):
    result.append(lst[idx]**2)
print(result)
print()

result = [lst[idx]**2 for idx in range(len(lst))]
print(result)
print()

# Quiz : 제곱한 결과에서 2의 배수인 값들만 추출하고 싶다면?
result = []
for idx in range(len(lst)):
    if (lst[idx]**2 % 2) == 0:
        result.append(lst[idx]**2)
print(result)
print()

result = [lst[idx]**2 for idx in range(len(lst)) if (lst[idx] % 2) == 0 ]
print(result)

4	16	1	25	64	16	
4	16	1	25	64	16	
[4, 16, 1, 25, 64, 16]

[4, 16, 1, 25, 64, 16]

[4, 16, 64, 16]

[4, 16, 64, 16]

# Quiz
# range() 객체를 이용해서 1 ~ 100 사이의 3의 배수만 List 에 담아서 출력한다면?

result = []
for idx in range(1,101):
    if (idx % 3) == 0:
        result.append(idx)
print('case01 - ', result)

print()

result = []
result = [ data for data in range(1,101) if (data % 3) == 0 ]
print('case02 - ', result)

case01 -  [3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48, 51, 54, 57, 60, 63, 66, 69, 72, 75, 78, 81, 84, 87, 90, 93, 96, 99]

case02 -  [3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48, 51, 54, 57, 60, 63, 66, 69, 72, 75, 78, 81, 84, 87, 90, 93, 96, 99]

# 사용자 정보를 저장

user = {
    "id" : 100,
    "name" : "admin",
    "role" : "superuser"
}
print(user)

# 권한 변경 함수
def updateRole(data , newRole):
    data["role"] = newRole
    
#caller
updateRole(user, "guest")
print(user)

{'id': 100, 'name': 'admin', 'role': 'superuser'}
{'id': 100, 'name': 'admin', 'role': 'guest'}

# 키 유무를 판단하고 싶다면?
# in : 조건에 맞는 논리값 반환
print("key 유무 판단 - ","id" in user)
print("key 유무 판단 - ","birth" in user)

print("key 통한 데이터 접근 - ", user["id"])
print("key 통한 데이터 접근 - ", user.get("id"))
user["address"] = "seoul"
print('data - ', user)

key 유무 판단 -  True
key 유무 판단 -  False
key 통한 데이터 접근 -  100
key 통한 데이터 접근 -  100
data -  {'id': 100, 'name': 'admin', 'role': 'superuser', 'address': 'seoul'}

# 사용자 정보를 저장
# 보안 관점에서 코드 개선 (불변 데이터 사용, 입력값 검증)

from copy import deepcopy

user = {
    "id" : 100,
    "name" : "admin",
    "role" : "superuser"
}
print(user)

# white list
allowedRoles = {"user", "guest", "manager"}

# 권한 변경 함수
def updateRole(data , newRole):
    copyUser = deepcopy(data)

    # 입력값 검증을 위한 조건 처리 (if ~ in)
    # if newRole in allowedRoles:
    #     data["role"] = newRole
        
    if newRole not in allowedRoles:
        print(f"허용되지 않는 변경 : {newRole}")
    else :
        copyUser["role"] = newRole
    
    return copyUser
    
#caller
changeUser = updateRole(user, "guest")
print('original - ' , user)
print('copy - ' , changeUser)

{'id': 100, 'name': 'admin', 'role': 'superuser'}
original -  {'id': 100, 'name': 'admin', 'role': 'superuser'}
copy -  {'id': 100, 'name': 'admin', 'role': 'guest'}

prodJson = {
    'melon' : {'price' : 100, 'qty' : 10},
    'bravo' : [200, 50], 
    'bibigo': [('price', 100),('qty' , 100)]
}
print(prodJson)

{'melon': {'price': 100, 'qty': 10}, 'bravo': [200, 50], 'bibigo': [('price', 100), ('qty', 100)]}

dictTmp = dict(city = 'busan', expo = 2030)
print(dictTmp)

{'city': 'busan', 'expo': 2030}

dictTmp = dict([
    ('city', 'busan'),
    ('expo', 2030)
])
print(dictTmp)

{'city': 'busan', 'expo': 2030}

keys = ('key01', 'key02', 'key03', 'key04')
datas = ('sk', 'samsung', 'lg', 'lgcns')

dictZip = dict(zip(keys, datas))
print(dictZip)
print()
#print(dir(dictZip))

for key in dictZip :
    print(key, dictZip[key])
    
print()
for data in dictZip.keys() :
    print(data)   

print()
for data in dictZip.values() :
    print(data)    

print()
for key, data in dictZip.items() :
    print(key, ' - ' ,data)

{'key01': 'sk', 'key02': 'samsung', 'key03': 'lg', 'key04': 'lgcns'}

key01 sk
key02 samsung
key03 lg
key04 lgcns

key01
key02
key03
key04

sk
samsung
lg
lgcns

key01  -  sk
key02  -  samsung
key03  -  lg
key04  -  lgcns

# Quiz
# 단어의 빈도수를 구한다면?
# 출력예시 { dog : 3 , cat : 4 , word : 1 , cs : 2 , sk : 2 }
# list 가지고 있는 매서드 : count
# dict 키의 중복을 허용하지 않는다는 점을 확인 : for ~ in(if)
wordLst = ['dog', 'dog', 'cat', 'cat', 'word', 'dog', 'cat', 'cs', 'cat', 'cs', 'sk', 'sk']

wordCount = {}

for w in wordLst:
    if w in wordCount:          
        wordCount[w] += 1       
    else:
        wordCount[w] = 1        

print(wordCount)

#print(dir(wordLst))
#print(wordLst.count('dog'))

#set(wordLst)
#[wordLst.count(data) for data in set(wordLst)]
result = dict(zip(set(wordLst),[wordLst.count(data) for data in set(wordLst)]))
print(result)

{'dog': 3, 'cat': 4, 'word': 1, 'cs': 2, 'sk': 2}
{'dog': 3, 'cs': 2, 'cat': 4, 'sk': 2, 'word': 1}

# Quiz
# 중복을 제거하고 유니크한 값을 출력하라

gender = ['남', '여', '여', '남', '남', '남', '여', '여']

setGender = set(gender)
print(setGender)

{'여', '남'}

# 개선된 코드 
print('case01 - ')
wordLst = ['dog', 'dog', 'cat', 'cat', 'word', 'dog', 'cat', 'cs', 'cat', 'cs', 'sk', 'sk'] 
unique  = sorted(set(wordLst))
freq    = [wordLst.count(word) for word in unique]
result  = dict(zip(unique, freq))

print(result)

print('case02 - ')
from collections import Counter
result = dict(Counter(wordLst))
print(result)

print('case03 - ')
result = {key: wordLst.count(key) for key in dict.fromkeys(wordLst)}
print(result)

case01 - 
{'cat': 4, 'cs': 2, 'dog': 3, 'sk': 2, 'word': 1}
case02 - 
{'dog': 3, 'cat': 4, 'word': 1, 'cs': 2, 'sk': 2}
case03 - 
{'dog': 3, 'cat': 4, 'word': 1, 'cs': 2, 'sk': 2}

if True :
    print('good')
else :
    print('bad')

good

score = input('점수를 입력하세요 : ')
print('type - ', type(score)) # 숫자를 입력했지만 str로 받아짐

score = int(input('점수를 입력하세요 : ')) # 형변환을 통해 int로 만들어주기
print('type - ', type(score))

type -  <class 'str'>

type -  <class 'int'>

score = int(input('점수를 입력하세요 : ')) 
print('type - ', type(score))
if score >= 90:
    if score >= 95:
        print('A+')
    else:
        print('A-')
    
elif score >= 80 :
    print('B')
elif score >= 70 :
    print('C')
elif score >= 60 :
    print('D')
else:
    print('F')

type -  <class 'int'>
A+

print('if ~ in')
areas = ['서울','경기','인천','부산']
region = input('지역을 입력하세요 : ')

if region in areas :
    print('응')
else :
    print(f'{region} 지역은 대상이 아닙니다.')

if ~ in

응

dictTmp = {'melon' : 100, 'bravo' : 200, 'bibibig' : 300}
print('키 존재 유무를 판단하고 싶다면 - ')
target = 'banana'
print('논리값 - ', target in dictTmp)
if target in dictTmp:
    print(dictTmp[target])
else :
    print(f'{target} 키는 대상이 아닙니다')

키 존재 유무를 판단하고 싶다면 - 
논리값 -  False
banana 키는 대상이 아닙니다

# Quiz
# 연산자 : % (나머지), == (동등비교), and (논리곱), or(논리합)
# 윤년 : 4의 배수이고 100의 배수가 아니거나 400의 배수일 때
# 요구사항) input 함수를 이용해서 년도를 입력받아 윤년인지 평년인지를 판단하고 싶다면?

year = int(input('년도를 입력하세요 : '))

if (year % 4 == 0 and year % 100 != 0) or (year % 400 == 0):
    print(f'{year} 윤년입니다')
else :
    print(f'{year} 평년입니다')

2000 윤년입니다

# Quiz
# input 함수를 이용해서 년도, 월을 입력받아서 월의 마지막 날을 출력한다면?

year = int(input('년도를 입력하세요 : '))
month = int(input('월를 입력하세요 : '))

dayLst = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
if (year % 4 == 0 and year % 100 != 0) or (year % 400 == 0):
    dayLst[1] = 29
print(f"{year}년 {month}월의 마지막 날은 {dayLst[month -1]}일 입니다")

2025년 10월의 마지막 날은 31일 입니다

from datetime import date

today = date.today()
print('type - ', type(today))
print()
print('dir - ', dir(today))
print()
print(today)
print(today.year, today.month, today.day, sep="-")

type -  <class 'datetime.date'>

dir -  ['__add__', '__class__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__ne__', '__new__', '__radd__', '__reduce__', '__reduce_ex__', '__replace__', '__repr__', '__rsub__', '__setattr__', '__sizeof__', '__str__', '__sub__', '__subclasshook__', 'ctime', 'day', 'fromisocalendar', 'fromisoformat', 'fromordinal', 'fromtimestamp', 'isocalendar', 'isoformat', 'isoweekday', 'max', 'min', 'month', 'replace', 'resolution', 'strftime', 'timetuple', 'today', 'toordinal', 'weekday', 'year']

2025-10-29
2025-10-29

from datetime import datetime

today = datetime.today()
print('type - ', type(today))
print()
print('dir - ', dir(today))
print()
print(today)
print(today.year, today.month, today.day, sep="-")

type -  <class 'datetime.datetime'>

dir -  ['__add__', '__class__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__ne__', '__new__', '__radd__', '__reduce__', '__reduce_ex__', '__replace__', '__repr__', '__rsub__', '__setattr__', '__sizeof__', '__str__', '__sub__', '__subclasshook__', 'astimezone', 'combine', 'ctime', 'date', 'day', 'dst', 'fold', 'fromisocalendar', 'fromisoformat', 'fromordinal', 'fromtimestamp', 'hour', 'isocalendar', 'isoformat', 'isoweekday', 'max', 'microsecond', 'min', 'minute', 'month', 'now', 'replace', 'resolution', 'second', 'strftime', 'strptime', 'time', 'timestamp', 'timetuple', 'timetz', 'today', 'toordinal', 'tzinfo', 'tzname', 'utcfromtimestamp', 'utcnow', 'utcoffset', 'utctimetuple', 'weekday', 'year']

2025-10-29 11:05:17.842037
2025-10-29

print('날짜에 대한 연산을 도와주는 모듈 dateutil')
from dateutil.relativedelta import relativedelta
from datetime import timedelta, date

today = date.today()
print('today - ', today)
print('하루 더 한다면?')

# -- 날짜 연산은 + 사용불가
# date 타입과 int타입으로 연산이 불가하기 떄문
# print(today + 1)

# timedelta에는 days만 있고 months나 year가 없다
day = timedelta(days = 1)
print('day - ', day)
print(today + day)

# relativedelta에는 months와 year가 있다
day = relativedelta(days = 1)
month = relativedelta(months = 1)
year = relativedelta(years = 1)

print(year , month , day)

today = today + year
today = today + month
today = today + day
print(today)

날짜에 대한 연산을 도와주는 모듈 dateutil
today -  2025-10-29
하루 더 한다면?
day -  1 day, 0:00:00
2025-10-30
relativedelta(years=+1) relativedelta(months=+1) relativedelta(days=+1)
2026-11-30

print('문자(날짜) -> 날짜')

strDate = '2025-10-29'
print()
today = datetime.strptime(strDate, '%Y-%m-%d')
print(today, type(today))

print()
print('날짜 -> 문자')
today = today.strftime('%Y-%m-%d')
print(today, type(today))

문자(날짜) -> 날짜

2025-10-29 00:00:00 <class 'datetime.datetime'>

날짜 -> 문자
2025-10-29 <class 'str'>

while 조건식 :
    실행문
    초기값에 증감

  Cell In[77], line 3
    초기값에 증감
         ^
SyntaxError: invalid syntax

startDay = datetime.strptime('2025-10-01', '%Y-%m-%d')
endDay = datetime.strptime('2025-10-10', '%Y-%m-%d')

# Quiz
# 날짜 리스트 생성
lst = []

currentDay = startDay
while currentDay <=endDay :
    lst.append(currentDay.strftime('%Y-%m-%d'))
    currentDay += timedelta(days=1)
print(lst)

['2025-10-01', '2025-10-02', '2025-10-03', '2025-10-04', '2025-10-05', '2025-10-06', '2025-10-07', '2025-10-08', '2025-10-09', '2025-10-10']

a = 10
b = 20

maxValue = a if a > b else b
print(maxValue)

age = 20
status = "성인" if age >= 19 else "미성년자"
print(status)

20
성인

# Quiz
# 리스트에서 삼항 연산자
# 짝수/홀수 표시한다면?
import random
lst = []
for idx in range(10) :
    nan = random.randint(1,100)
    lst.append(nan)    
print(lst)

result = ["짝수" if data %2 == 0 else "홀수" for data in lst]
print(result)

[44, 17, 27, 80, 65, 23, 34, 99, 85, 40]
['짝수', '홀수', '홀수', '짝수', '홀수', '홀수', '짝수', '홀수', '홀수', '짝수']

# Quiz
# input 함수를 이용해서 핸드폰 번호를 입력 011-xxxx-xxxx , 016-xxxx-xxxx, 019-xxxx-xxxx
# 삼항연산자를 이용해서 통신사의 정보를 확인하고 싶다면

num = input("핸드폰 번호를 입력해 주세요 : ")

result = (
    "SK" if num.split('-')[0] == "011" else
    "LG" if num.split('-')[0] == "016" else
    "KT" if num.split('-')[0] == "019" else
    "error"
)

print("통신사 :", result)

통신사 : SK

# Quiz
# 주민등록번호의 뒷자리 7자리 중 두번째와 세번째는 지역코드
# 지역코드 출생지
# 00 ~ 08 seoul
# 09 ~ 12 busan
# input 함수를 이용해서 주민번호를 입력 xxxxxx-x05xxxx 출생지가 서울인지 아닌지를 판단하고 싶다면?
num = input("주민번호를 입력해 주세요 : ")
print('case01 - ')
region = int(num.split('-')[1][1:3]) 
result = ( "서울 출생" if 0 <= region <= 8 else "기타지역" )
print(result)

print('case02 - ')
seoulCity = ["00","01","02","03","04","05","06","07","08",]
if num.split('-')[1][1:3] in seoulCity:
    print('서울')
else :
    print('서울 외')

print('case03 - ')
print('서울' if num.split('-')[1][1:3] in seoulCity else '서울 외')

case01 - 
서울 출생
case02 - 
서울
case03 - 
서울

# Quiz
# input 함수를 이용해서 시간을 입력받는다
# 입력된 시간이 정각인지 아닌지를 판단하고 싶다면?
# 입력예시) 12:00, 03:10

time = input("시간을 입력해 주세요 : ")
print("정각" if time.split(':')[1] == "00" else "정각아님")

정각

print(3+4 > 7+3)
print(5 + 10 > 3 and 7+3 == 10)

print((3+4) > (7+3))
print(((5 + 10) > 3) and ((7+3) == 10))

False
True
False
True

for data in range(10):
    print(data)

msg = 'see u next time'
for char in msg:
    print(char, end=' ')
print()
for idx in range(len(msg)):
    print(msg[idx], end=' ')
print()
tupleTmp = (4, 6, 1, 3)
for idx, data in enumerate(tupleTmp):
    print('idx - ', idx , 'data - ', data)

s e e   u   n e x t   t i m e 
s e e   u   n e x t   t i m e 
idx -  0 data -  4
idx -  1 data -  6
idx -  2 data -  1
idx -  3 data -  3

wordLst = ['dog', 'dog', 'cat', 'cat', 'word', 'dog', 'cat', 'cs', 'cat', 'cs', 'sk', 'sk'] 
result = {}
for data in wordLst :
    if data in result:
        result[data] += 1
    else :
        result[data] = 1

print(result)

{'dog': 3, 'cat': 4, 'word': 1, 'cs': 2, 'sk': 2}

print('guess game - ')
print('1 ~ 100 사이의 난수를 생성하고 숫자를 맞춰보는 게임')

from random import randint
answer = randint(1, 100)
print('answer - ', answer)

guess game - 
1 ~ 100 사이의 난수를 생성하고 숫자를 맞춰보는 게임
answer -  80

# Quiz
# input 함수를 이용해서 1 ~ 100 사이의 값을 입력받아
# 정답보다 크면 down, 정답보다 작으면 up
# 횟수 10 번으로
# 최종적인 출력예시) 정답 {}, 횟수 {}

from random import randint
answer = randint(1, 100)

for time in range(10):
    a = int(input('숫자를 입력해 주세요 : '))
    if a > answer:
        print('down')
        print(f'남은 횟수 : {9-time}')
    elif a < answer:
        print('up')
        print(f'남은 횟수 : {9-time}')
    else:
        break
print(f'정답 {answer} , 횟수 {time+1}')

up
남은 횟수 : 9

up
남은 횟수 : 8

up
남은 횟수 : 7

down
남은 횟수 : 6

up
남은 횟수 : 5

up
남은 횟수 : 4

정답 84 , 횟수 7

# Quiz
# input 함수를 이용해서 1 ~ 100 사이의 값을 입력받아
# 정답보다 크면 down, 정답보다 작으면 up
# 횟수 10 번으로
# 최종적인 출력예시) 정답 {}, 횟수 {}

from random import randint
answer = randint(1, 100)
print('answer - ', answer)
tries = 1
for idx in range(10):
    guess = int(input('1 ~ 100 사이의 숫자를 입력하세요 : '))
    if answer == guess :
        print('정답')
        break
    elif answer > guess :
        print('up')
    else :
        print('down')
    tries += 1

if answer == guess :
    print(f'정답 {answer} , 시도 횟수 {tries}')
else :
    print(f'정답 {answer} , 제공되는 기회를 전부 사용하였습니다.')

answer -  89

up

up

정답
정답 89 , 시도 횟수 3

# while로 변경해보기

from random import randint
answer = randint(1, 100)
print('answer - ', answer)
tries = 1

while tries <= 10:
    guess = int(input('1 ~ 100 사이의 숫자를 입력하세요 : '))
    if answer == guess :
        print('정답')
        break
    elif answer > guess :
        print('up')
    else :
        print('down')
    tries += 1

if answer == guess :
    print(f'정답 {answer} , 시도 횟수 {tries}')
else :
    print(f'정답 {answer} , 제공되는 기회를 전부 사용하였습니다.')

answer -  7

up

up

up

up

up

up

up

up

up

# Quiz
# input 함수를 이용해서 1 ~ 100 사이의 값을 입력받아
# 정답보다 크면 down, 정답보다 작으면 up
# 횟수 10 번으로
# 최종적인 출력예시) 정답 {}, 횟수 {}

from random import randint
answer = randint(1, 100)
print('answer - ', answer)
tries = 1
for idx in range(10):
    guess = int(input('1 ~ 100 사이의 숫자를 입력하세요 : '))
    if answer == guess :
        print(f'정답 {answer} , 시도 횟수 {tries}')
        break
    elif answer > guess :
        print('up')
    else :
        print('down')
    tries += 1

else :
    print(f'정답 {answer} , 제공되는 기회를 전부 사용하였습니다.')

answer -  44

up

up

up

up

up

up

up

up

up

# while로 변경해보기

from random import randint
answer = randint(1, 100)
print('answer - ', answer)
tries = 1

while tries <= 10:
    guess = int(input('1 ~ 100 사이의 숫자를 입력하세요 : '))
    if answer == guess :
        print(f'정답 {answer} , 시도 횟수 {tries}')
        break
    elif answer > guess :
        print('up')
    else :
        print('down')
    tries += 1

else :
    print(f'정답 {answer} , 제공되는 기회를 전부 사용하였습니다.')

answer -  24

up

up

up

up

up

up

up

up

up

# Quiz 
# 4년에 한번씩 열리는 올림픽이 있다(2024)
# 향후 50년동안 열리는 올림픽의 년도를 출력
# 한 줄에 년도를 5개씩만 출력
# hint) escape sequence : \n, \t


cnt = 0
for year in range(2024, 2074, 4):
    cnt += 1
    if cnt % 5 == 0:
        print(year, end = '\n')
    else :
        print(year, end = '\t')

2024	2028	2032	2036	2040
2044	2048	2052	2056	2060
2064	2068	2072

dan = int(input("단을 입력하세요 : "))
for gu in range(1, 10):
    print(f'{dan} * {gu} = {dan*gu}')

9 * 1 = 9
9 * 2 = 18
9 * 3 = 27
9 * 4 = 36
9 * 5 = 45
9 * 6 = 54
9 * 7 = 63
9 * 8 = 72
9 * 9 = 81

print('구구단 - ')

for row in range(2, 10) :
    if row == 5 :
        continue
    for col in range(1, 10) : 
        print(f'{row} * {col} = {row*col}', end = "\t")
    print()

    # if row == 3 :
    #     break

구구단 - 
2 * 1 = 2	2 * 2 = 4	2 * 3 = 6	2 * 4 = 8	2 * 5 = 10	2 * 6 = 12	2 * 7 = 14	2 * 8 = 16	2 * 9 = 18	
3 * 1 = 3	3 * 2 = 6	3 * 3 = 9	3 * 4 = 12	3 * 5 = 15	3 * 6 = 18	3 * 7 = 21	3 * 8 = 24	3 * 9 = 27	
4 * 1 = 4	4 * 2 = 8	4 * 3 = 12	4 * 4 = 16	4 * 5 = 20	4 * 6 = 24	4 * 7 = 28	4 * 8 = 32	4 * 9 = 36	
6 * 1 = 6	6 * 2 = 12	6 * 3 = 18	6 * 4 = 24	6 * 5 = 30	6 * 6 = 36	6 * 7 = 42	6 * 8 = 48	6 * 9 = 54	
7 * 1 = 7	7 * 2 = 14	7 * 3 = 21	7 * 4 = 28	7 * 5 = 35	7 * 6 = 42	7 * 7 = 49	7 * 8 = 56	7 * 9 = 63	
8 * 1 = 8	8 * 2 = 16	8 * 3 = 24	8 * 4 = 32	8 * 5 = 40	8 * 6 = 48	8 * 7 = 56	8 * 8 = 64	8 * 9 = 72	
9 * 1 = 9	9 * 2 = 18	9 * 3 = 27	9 * 4 = 36	9 * 5 = 45	9 * 6 = 54	9 * 7 = 63	9 * 8 = 72	9 * 9 = 81

print('working - ')
print('default parameter - ')
def greet(name='guest'):
    '''사용자에게 인사하는 함수'''
    return (f'hi ~, {name}')

working - 
default parameter -

print('caller - ')
result = greet()
print(result)

caller - 
hi ~, guest

def userAdd(a,b):
    return a ** b

print('positional argument - ')
result = userAdd(3,2)
print('type - ', type(result))
print('result - ', result)

positional argument - 
type -  <class 'int'>
result -  9

print('keyword argument - ')
result = userAdd(b=3,a=2)
print('type - ', type(result))
print('result - ', result)

keyword argument - 
type -  <class 'int'>
result -  8

add = lambda x, y : x + y
print('type - ', type(add))

type -  <class 'function'>

result = add(3, 5)
print('result - ', result)

result -  8

lst = [1,2,3,4,5,6,7,8]

print('case01 - ')
result = [data ** 2 for data in lst]
print(result)

print('case02 - ')
print('map - 열거형 타입(반복 가능한 요소에 대해서 특정 함수를 적용하고 그 결과를 map 객체로 반환)')
print('map(function, iterable)')
double = list(map(lambda data : data ** 2 , lst ))
print(double)

print()
print('case03 - ')
print('filter(function, iterable) ')
even = list(filter(lambda data : data % 2 == 0 , lst ))
print(even)

case01 - 
[1, 4, 9, 16, 25, 36, 49, 64]
case02 - 
map - 열거형 타입(반복 가능한 요소에 대해서 특정 함수를 적용하고 그 결과를 map 객체로 반환)
map(function, iterable)
[1, 4, 9, 16, 25, 36, 49, 64]

case03 - 
filter(function, iterable) 
[2, 4, 6, 8]

wordLst = ['pineApple', 'cherry', 'watermelon', 'banana', 'apple']
result = sorted(wordLst, key=lambda x : len(x))
print(result)

['apple', 'cherry', 'banana', 'pineApple', 'watermelon']

# 반환예시) [ 'www.skshielus.com', 'www.samsung.com' , etc .....]
def makeUrl(lst : list) -> list :
    #return [f'www.{data}.com' for data in lst]
    return list(map(lambda x: 'www.'+x+'.com', lst))

companyLst = ['skshieldus', 'samsung', 'lgcns', 'skcnc', '<script>alert(1)</script>']
urls = makeUrl(companyLst)
print(urls)

['www.skshieldus.com', 'www.samsung.com', 'www.lgcns.com', 'www.skcnc.com', 'www.<script>alert(1)</script>.com']

# 코드의 개선점
# 내부값이 문자열인지, 악성문자열 검증
# 웹 페이지에 출력하면 XSS 취약점 발생
# quote() : 공백, 특수문자 등을 안전하게 인코딩

from urllib.parse import quote

# def safemakeUrl(lst : list) -> list :
#     return list(map(lambda x: 'www.'+quote(x)+'.com', lst))

# isinstance 입력받은 lst 타입이 list 타입인지
def safemakeUrl(lst : list) -> list :
    if not isinstance(lst, list):
        raise TypeError("입력은 리스트 타입으로 전달 부탁드립니다.")
    if not all(isinstance(name, str) for name in lst) :
        raise ValueError("요소의 타입은 문자열")
    # 안전하게 URL 인코딩
    return list(map(lambda x: 'www.'+quote(x)+'.com', lst))

companyLst = ['skshieldus', 'samsung', 'lgcns', 'skcnc', '<script>alert(1)</script>']
urls = safemakeUrl(companyLst)
print(urls)

['www.skshieldus.com', 'www.samsung.com', 'www.lgcns.com', 'www.skcnc.com', 'www.%3Cscript%3Ealert%281%29%3C/script%3E.com']

def myColor(name, color, isLike : bool) :
    print(f'{name}님은 {color}색을 {'좋아' if isLike else '싫어'}합니다.')

# 출력예시) XXX님은 xxx색을 좋아(싫어)합니다.
myColor('섭섭님', 'red', False)
myColor('섭섭님', 'blue', True)

섭섭님님은 red색을 싫어합니다.
섭섭님님은 blue색을 좋아합니다.

# 전역변수
globalVar = 10 

def outer() :
    # 외부 함수 변수
    enclosingVar = 20
    def inner() :
        # 지역변수
        localVar = 30
        print('global - ', globalVar)
        print('enclosingVar - ', enclosingVar)
        print('localVar - ', localVar)
    inner()
    # error
    # print(localVar)

outer()
print('globalVar - ', globalVar)
# error
# print(enclosingVar)

global -  10
enclosingVar -  20
localVar -  30
globalVar -  10

x = 10
def test() :
    x = 20

test()
print('x - ', x)

x -  10

cnt = 0

def increment () :
    global cnt
    cnt += 1

increment()
increment()
increment()
print('cnt - ', cnt)

cnt -  3

def outer() :
    outerVar = 10
    def inner() :
        nonlocal outerVar # nonlocal 외부 함수를 호출 가능하게 해줌
        outerVar += 1
        print('inner outerVar : ', outerVar)
    inner()
    print('outer outerVar : ', outerVar)

outer()

inner outerVar :  11
outer outerVar :  11

def outer(name : str):
    def inner():
        return f'hi ~ , {name}'
    return inner # 함수반환

msg = outer('jslim')
print('type - ', type(msg))
print('result - ', msg())

type -  <class 'function'>
result -  hi ~ , jslim

# 전역변수를 사용하지 않고 상태를 유지하는 코드 개선

def counter():
    count = 0
    def increase():
        nonlocal count
        count += 1
        return count
    return increase

cnt = counter()
print(cnt())
print(cnt())
print(cnt())
print(cnt())
print(cnt())

def commonChecking():
    print('>>>>>>>>> permission check')
def commonLogging() :
    print('>>>>>>>>> log......')

def decorator(checkingFunc, loggingFunc):
    def logic():
        checkingFunc()
        print('업무로직')
        loggingFunc()
    return logic

innerLogic = decorator(commonChecking, commonLogging)
innerLogic()

>>>>>>>>> permission check
업무로직
>>>>>>>>> log......

def clo(num):
    x = 10
    def test():
        nonlocal x
        x = x + num
        return x
    return test

a = clo(1)
print(a())
print(a())
print(a())

11
12
13

print('실행시간 성능 로그 - ')
from time import time, sleep

def timer(func): 
    def wrapper(*args, **kwargs):
        start = time()
        result = func(*args, **kwargs)
        elapsed = time() - start
        if elapsed > 2 :
            print(f'경고 : {func.__name__}, 실행시간 : {elapsed:.2f}초 초과되어서 알림')
        return result
    return wrapper

def timerFunc():
    sleep(1)
    return '성능확인'

실행시간 성능 로그 -

# 함수 장식자
@timer
def timerFunc():
    sleep(1)
    return '성능확인'

inner = timerFunc()
print(inner)

성능확인

def timerFunc():
    sleep(1)
    return '성능확인'

inner = timer(timerFunc)
print(inner())

성능확인

def variableLenArgs(*args : int) -> None :
    print('type - ', type(args))
    total = sum(args)
    return total

result = variableLenArgs(1,2,3)
print(result)
result = variableLenArgs(1,2,3,4,5)
print(result)
result = variableLenArgs(1,2,3,4,5,6,7)
print(result)

type -  <class 'tuple'>
6
type -  <class 'tuple'>
15
type -  <class 'tuple'>
28

def variableLenArgsDict(**args) :
    print('type - ', type(args))
    for key, value in args.items():
        print(key,value)

variableLenArgsDict(name='jslim',age=30,region='seoul')

type -  <class 'dict'>
name jslim
age 30
region seoul

def variableLenArgsMix(subject, *args, **kwargs):
    print('subject - ', subject)
    print('args - ', args)
    print('kwargs - ', kwargs)

variableLenArgsMix('사용자 정보 ', '임섭순', '섭섭해', '쉴더스', a=1,b=2)

subject -  사용자 정보 
args -  ('임섭순', '섭섭해', '쉴더스')
kwargs -  {'a': 1, 'b': 2}

# endpoint
# Quiz
# 아래와 같은 api 요청을 생성
# https://api.v1.example.com/search?q=secure&page=2
def makeApiRequest(endpoint, **params) :
    query = "&".join([f"{k}={v}" for k, v in params.items()])
    print(f'{endpoint}?{query}')
    return (f'{endpoint}?{query}')

api = makeApiRequest('https://api.v1.example.com/search', q='secure', page=2)
print(api)

https://api.v1.example.com/search?q=secure&page=2
https://api.v1.example.com/search?q=secure&page=2

def makeApiRequest(endpoint, **params):
    # params를 key=value 형태로 연결
    query = "&".join([f"{k}={v}" for k, v in params.items()])
    url = f"{endpoint}?{query}"
    print(url)
    return url

def makeApiRequest(endpoint, **params):
    query = ""
    for k, v in params.items():
        if query == "":
            query += f"{k}={v}"
        else:
            query += f"&{k}={v}"
    url = endpoint + "?" + query
    print(url)
    return url

api = makeApiRequest('https://api.v1.example.com/search', q='secure', page=2)
print(api)

https://api.v1.example.com/search?q=secure&page=2
https://api.v1.example.com/search?q=secure&page=2

from urllib.parse import urlencode
params = {'q' : 'secure', 'page' : 2} 
queryString = urlencode(params)
print(queryString)

q=secure&page=2

from urllib.parse import urlencode
def makeApiRequest(endpoint, **params) :
    queryString = urlencode(params)
    return endpoint+'?'+queryString

api = makeApiRequest('https://api.v1.example.com/search', q='secure', page=2)
print(api)

https://api.v1.example.com/search?q=secure&page=2

print('보안 - 코드 개선')
from urllib.parse import urlencode
def makeApiRequest(endpoint, **params) :
    
    whiteLst = {'q', 'page', 'lang'}
    safeParams = { k : v for k, v in params.items() if k in whiteLst}
    queryString = urlencode(safeParams)
    return endpoint+'?'+queryString

보안 - 코드 개선

api = makeApiRequest('https://api.v1.example.com/search', q='secure', page=2)
print(api)

https://api.v1.example.com/search?q=secure&page=2

lst = [1,2,3] 

try :
    for idx in range(len(lst)+1):
        print(lst[idx])
except IndexError as e :
    print(f'{e} 예외 발생 함')
else :
    print('예외가 발생하지 않았을 때 수행하는 블럭')
finally :
    print('예외발생 여부와 상관없이 수행하는 블럭')
print('>>>> 정상종료')

1
2
3
list index out of range 예외 발생 함
예외발생 여부와 상관없이 수행하는 블럭
>>>> 정상종료

def getUserInfo():
    ''' 사용자의 정보를 입력받고, 형식 오류나 유효성을 처리하는 기능'''
    try:
        name = input('이름을 입력하세요 : ').strip()
        if not name.isalpha():
            raise ValueError('이름은 문자만 포함해야 합니다!!')
            
        age = input('나이를 입력하세요 : ').strip()
        if not age.isdigit():
            raise ValueError('나이는 숫자만 입력하세요!!')
        age = int(age)
        if age < 0 or age > 120:
            raise ValueError('1~120 사이의 숫자만 입력하세요!!')
            
        email = input('이메일 주소를 입력하세요 : ').strip()
        if '@' not in email or '.' not in email :
            raise ValueError('올바를 이메일 형식이 아닙니다!!')
        
        
    except Exception as ve:
        print(f'입력오류 {ve}')
    else :
        return {
            "name" : name,
            "age" : age,
            "email" : email
        }
    finally :
        print('입력정보를 정상적으로 처리 하였습니다!!')

result = getUserInfo()
print(result)

입력오류 이름은 문자만 포함해야 합니다!!
입력정보를 정상적으로 처리 하였습니다!!
None

# Quiz
# 예외처리 관점, 보안적 관점
# 매개변수로 전달받은 리스트타입 요소의 값을 거듭제곱해서 반환하여 출력하고자 한다

def lstPrt(lst: list) -> list:
    try:
        
        for idx in range(len(lst)):
            value = lst[idx]

            value = str(value)

            if not value.isdigit():
                raise ValueError(f'{value}는 숫자가 아닙니다!')
            value = int(value)
                
            print(f'{value ** 2}')

    except Exception as e:
        print(f'예외 발생: {e}')
    else:
        print('예외가 발생하지 않았을 때 수행하는 블럭')
    finally:
        print('예외 발생 여부와 상관없이 수행하는 블럭')
    print('>>>> 정상종료')

# Quiz
# 예외처리 관점 (요소의 타입), 보안적 관점(매개변수의 타입)
# 매개변수로 전달받은 리스트타입 요소의 값을 거듭제곱해서 반환하여 출력하고자 한다

# def lstPrt(lst: list) -> list:
#     if not isinstance(lst, list):
#         raise TypeError('매개변수 타입은 반드시 list 형태')
    
#     result = []
#     for data in lst:
#         if not isinstance(data, (int, float)):
#             print('숫자가 아닌 값이 포함 됨')
#             continue
#         else :
#             result.append(data ** 2)        
#     return result

import logging
# level
# debug, info, warning, error, critical
# format (%s -%s etc....)
# format key : asctime, levelname, name, message, etc .....z

logging.basicConfig(level=logging.WARNING, 
                    format="%(asctime)s - %(levelname)s - %(message)s", force=True)

# 예외, 보안, 로깅
def lstPrt(lst: list) -> list:
    
    # 보안관점에서 입력검증
    if not isinstance(lst, list):
        raise TypeError('매개변수 타입은 반드시 list 형태')
    
    result = []
    for idx, data in enumerate(lst):
        if not isinstance(data, (int, float)):
            logging.warning(f'{idx}에 숫자가 아닌 값이 포함된 타입 {type(data).__name__}')
            continue
        try :
            result.append(data ** 2)
        except Exception as e:
            logging.error(f'예상치 못한 예외 발생 : {e}')
    return result

tmp = [10,20,30,40,'seop',50,60]
result = lstPrt(tmp)
print(result)

2025-10-31 14:24:25,296 - WARNING - 4에 숫자가 아닌 값이 포함된 타입 str

[100, 400, 900, 1600, 2500, 3600]

# 에러메시지 노출 - print()
# 서버환경 print() -> 로깅(level=warning/info/error) 일반화된 메시지 제공이 안전
# import logging
import logging
logging.basicConfig(level=logging.WARNING)

data = 'jslim'
try:
    print(data ** 2)
except TypeError as t:
    logging.warning(f'숫자가 아닌값을 발견 : {data}')

WARNING:root:숫자가 아닌값을 발견 : jslim

import logging
# level
# debug, info, warning, error, critical
# format (%s -%s etc....)
# format key : asctime, levelname, name, message, etc .....z

logging.basicConfig(level=logging.WARNING, 
                    format="%(asctime)s - %(levelname)s - %(message)s",
                   filename='shieldus.log',
                   filemode='a')

import logging
# level
# debug, info, warning, error, critical
# format (%s -%s etc....)
# format key : asctime, levelname, name, message, etc .....z

logging.basicConfig(level=logging.WARNING, 
                    format="%(asctime)s - %(levelname)s - %(message)s", force=True)

filePath = './data/greeting.txt'.strip()
try :
    file = open(filePath, mode='r', encoding='utf-8')
except:
    logging.error(f'파일을 열 수 없습니다.')
    
print('type - ', type(file))
#print('dir - ', dir(file))
print(file.read(), type(file.read())) # str
# print(file.readlines(), type(file.readlines())) # list
file.close()

type -  <class '_io.TextIOWrapper'>
강사님과 함께하는 즐겁지 아니하지 아니한 파이썬수업
그렇지만 열공하자
오늘은 즐거운 금요일
불금인데.....방콕이 답이다 <class 'str'>

# with open은 file.close()를 포함하고 있음 안써도 됨

filePath = './data/greeting.txt'.strip()
with open(filePath, mode='r', encoding='utf-8') as file :
    lst = file.readlines()
    for txt in lst:
        print(txt.strip('\n'))

강사님과 함께하는 즐겁지 아니하지 아니한 파이썬수업
그렇지만 열공하자
오늘은 즐거운 금요일
불금인데.....방콕이 답이다

# data = '안녕하세요~ 한 주 수고많으셨구요... 즐거운 금요일 되시길 바랍니다.'
data = {'id' : 'xxxx' , 'pwd' : 'xxxx'}
print('type - ', type(data))

# dict -> json , json -> dict
import json

filePath = './data/msg.json'.strip()
with open(filePath, mode='w', encoding='utf-8') as file :
    # file.write(data)
    json.dump(data, file)

type -  <class 'dict'>

print('json 형식의 파일을 읽어들인다면? - ')
import json

filePath = './data/msg.json'.strip()
with open(filePath, mode='r', encoding='utf-8') as file :
    # file.write(data)
    loadData = json.load(file)
    print(loadData, '-', type(loadData))
    print(loadData['id'])

json 형식의 파일을 읽어들인다면? - 
{'id': 'xxxx', 'pwd': 'xxxx'} - <class 'dict'>
xxxx

# Quiz
# 사용자의 이름을 두자리만 출력하고 나머지는 마스킹처리(su********)
# 로깅포맷 
# 로깅포맷형태를 파일로 저장 userAccess.log
# decorator 이용해서 (보안적인 측면을 강화)
# 예외관련해서 필요한 부분이 있다면 추가

from datetime import datetime
import logging

logging.basicConfig(level=logging.WARNING, 
                    format="%(asctime)s - %(levelname)s - %(message)s", force=True)

filePath = './data/userAccess.log'.strip()
def getProfile(user : dict) -> None :
    try :
        if not isinstance(user, dict):
            raise TypeError('매개변수 타입은 반드시 dict 형태')
            
        start = datetime.today()
        name = user['name'] 
        mask = name[:2] + '*' * (len(name) - 2)
        end = datetime.today()
        time = (end - start)
        data = (f'검색한 시간 - {start}, 사용자 - {mask}, 실행시간 정보 - {time}')
        with open(filePath, mode='w', encoding='utf-8') as file :
            file.write(data)
        print(data)
    except Exception as e:
        logging.error(f'예상치 못한 예외 발생 : {e}')

# caller
user = {'name' : 'superadmin', 'authenticated' : True}
# 관리자가 사용자 정보를 확인하고 한다.
# 관리자가 사용자의 정보를 검색한 시간, 사용자이름, 실행시간 정보를 userAccess.log 파일로 저장하고 싶다면
getProfile(user)

검색한 시간 - 2025-10-31 16:43:06.260883, 사용자 - su********, 실행시간 정보 - 0:00:00.000008

import logging

name = user['name']
print(name)
maskedName = name[:2] + '*' * (len(name) - 2)
print(maskedName)

logging.basicConfig(level=logging.WARNING, 
                    format="%(asctime)s - %(levelname)s - %(message)s", force=True)

superadmin
su********

# Quiz
# 사용자의 이름을 두자리만 출력하고 나머지는 마스킹처리(su********)
# 로깅포맷 
# 로깅포맷형태를 파일로 저장 userAccess.log
# decorator 이용해서 (보안적인 측면을 강화)
# 예외관련해서 필요한 부분이 있다면 추가
from time import time, sleep
def secureLog():
    def wrapper():
        pass

@secureLog
def getProfile(user : dict) -> None :
    print(f'{user['name']} 프로필을 관리자가 검색합니다.')
    sleep(5)

import numpy as np
import pandas as pd

lst = [1,2,3,4,5]
print('type - ', type(lst))

type -  <class 'list'>

ary = np.array(lst)
print('type  - ', type(ary))
print('shape - ', ary.shape)
print('dim   - ', ary.ndim)
print('dtype - ', ary.dtype)

type  -  <class 'numpy.ndarray'>
shape -  (5,)
dim   -  1
dtype -  int64

def aryInfo(ary) :
    print('type  - ', type(ary))
    print('shape - ', ary.shape)
    print('dim   - ', ary.ndim)
    print('dtype - ', ary.dtype)
    print()
    print('data - ')
    print(ary)

aryInfo(ary)

type  -  <class 'numpy.ndarray'>
shape -  (5,)
dim   -  1
dtype -  int64

data - 
[1 2 3 4 5]

lst = [1,2,3,4,5]
# error
# print(lst ** 2) 
result = []
for element in lst :
    result.append(element ** 2)
print(result)

[1, 4, 9, 16, 25]

ary = np.array(lst)
print(ary ** 2)

[ 1  4  9 16 25]

xAry = np.array([1,2,3])
yAry = np.array([3,2,1])

print('산술연산 - ', xAry + yAry)
print('비교연산 - ', xAry == 2)
print('비교연산 - ', xAry > 2)
print('비교연산 - ', (xAry == 2) & (yAry > 2))
print('인덱싱   - ', xAry[0])
print('슬라이싱 - ', xAry[0:2])

x = np.arange(10)
print(x)

산술연산 -  [4 4 4]
비교연산 -  [False  True False]
비교연산 -  [False False  True]
비교연산 -  [False False False]
인덱싱   -  1
슬라이싱 -  [1 2]
[0 1 2 3 4 5 6 7 8 9]

print(' 2 * 3 배열을 생성하고 싶다면 - ')
lst = [[1,2,3],[4,5,6]]
for rIdx, row in enumerate(lst) :
    for cIdx , col in enumerate(row) :
        # print(rIdx, end='\t')
        # print(cIdx, end='\t')
        print(lst[rIdx][cIdx], end='\t')
    print()

 2 * 3 배열을 생성하고 싶다면 - 
1	2	3	
4	5	6

twoAry = np.array(lst)
aryInfo(twoAry)
print()
print('len - ', len(twoAry)) # 행의 길이
print('len - ', len(twoAry[0])) # 열의 길이
print('len - ', len(twoAry[1]))

type  -  <class 'numpy.ndarray'>
shape -  (2, 3)
dim   -  2
dtype -  int64

data - 
[[1 2 3]
 [4 5 6]]

len -  2
len -  3
len -  3

lst = [ [[1,2,3],[4,5,6]], [[1,2,3],[4,5,6]] ]
ary = np.array(lst, dtype=object)
aryInfo(ary)


# print(lst * 2)

type  -  <class 'numpy.ndarray'>
shape -  (2, 2, 3)
dim   -  3
dtype -  object

data - 
[[[1 2 3]
  [4 5 6]]

 [[1 2 3]
  [4 5 6]]]
[[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]]

aryInfo(ary.astype(np.int32))

type  -  <class 'numpy.ndarray'>
shape -  (2, 2, 3)
dim   -  3
dtype -  int32

data - 
[[[1 2 3]
  [4 5 6]]

 [[1 2 3]
  [4 5 6]]]

ary = np.array([0,1,2,3,4,5,6,7,8,9])
print('2의 배수인것들만 출력한다면? - ')

# for i in range(len(ary)):
#     if ary[i] % 2 ==0:
#         print(ary[i])

print(ary%2==0)
print('boolean indexing - ', ary[ary%2==0])

evenIdx = np.array([0,2,4,6,8])
print(ary[evenIdx])

2의 배수인것들만 출력한다면? - 
[ True False  True False  True False  True False  True False]
boolean indexing -  [0 2 4 6 8]
[0 2 4 6 8]

# Quiz
# 3의 배수만 출력
# 4로 나누어 1이 남는 값들만 출력
# 3의 배수이고 4로 나누어 1이 남는 값들만 출력

ary = np.arange(1,21)
aryInfo(ary)
print()
print(ary[ary%3==0])
print(ary[ary%4==1])
print(ary[(ary%3==0) & (ary%4==1)])

type  -  <class 'numpy.ndarray'>
shape -  (20,)
dim   -  1
dtype -  int64

data - 
[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20]

[ 3  6  9 12 15 18]
[ 1  5  9 13 17]
[9]

ary = np.arange(1, 13).reshape(3,4)
aryInfo(ary)

type  -  <class 'numpy.ndarray'>
shape -  (3, 4)
dim   -  2
dtype -  int64

data - 
[[ 1  2  3  4]
 [ 5  6  7  8]
 [ 9 10 11 12]]

# Quiz
# 정수배열인덱싱을 이용해서 모든행의 0, 3 열의 값을 출력한다면
print(ary[:,[0,3]])
print()
# print(ary[0][0], ary[0,0])
# print(ary[:][0]) # 행의 값
# print(ary[:,0]) # 열의 값
# print()
# 불리언 인덱싱
print(ary[:,[True,False,False,True]])

[[ 1  4]
 [ 5  8]
 [ 9 12]]

[[ 1  4]
 [ 5  8]
 [ 9 12]]

print('5 - ',ary[1,0], type(ary[1,0]))

5 -  5 <class 'numpy.int64'>

# Quiz
# [2 10] 값을 추출 -
print('[2 10] - ', ary[[0,2],1], type(ary[[0,2],1]))

[2 10] -  [ 2 10] <class 'numpy.ndarray'>

# Quiz
# [[1 3] 
#  [9 11]] 값 추출 -

print(ary[[0,2]][:,[0,2]])

[[ 1  3]
 [ 9 11]]

ary = np.arange(1,7).reshape(2,3)
aryInfo(ary)
print()
transposeAry = ary.T
aryInfo(transposeAry)

type  -  <class 'numpy.ndarray'>
shape -  (2, 3)
dim   -  2
dtype -  int64

data - 
[[1 2 3]
 [4 5 6]]

type  -  <class 'numpy.ndarray'>
shape -  (3, 2)
dim   -  2
dtype -  int64

data - 
[[1 4]
 [2 5]
 [3 6]]

print('1차원 배열에 대한 전치행렬이 필요할까요? - 필요함')
print('왜? 함수의 인자로 전달할 때 타입이 맞아야 하므로')
ary = np.arange(1,7)
aryInfo(ary)
print()
transposeAry = ary.reshape(1,6).T
aryInfo(transposeAry)

1차원 배열에 대한 전치행렬이 필요할까요? - 필요함
왜? 함수의 인자로 전달할 때 타입이 맞아야 하므로
type  -  <class 'numpy.ndarray'>
shape -  (6,)
dim   -  1
dtype -  int64

data - 
[1 2 3 4 5 6]

type  -  <class 'numpy.ndarray'>
shape -  (6, 1)
dim   -  2
dtype -  int64

data - 
[[1]
 [2]
 [3]
 [4]
 [5]
 [6]]

aryInfo(transposeAry.flatten())

type  -  <class 'numpy.ndarray'>
shape -  (6,)
dim   -  1
dtype -  int64

data - 
[1 2 3 4 5 6]

print('행의 갯수가 동일한 두개의 배열을 연결 - hstack()')
ary01 = np.ones((3,4))
ary02 = np.zeros((3,4))
print(np.hstack([ary01,ary02]))

행의 갯수가 동일한 두개의 배열을 연결 - hstack()
[[1. 1. 1. 1. 0. 0. 0. 0.]
 [1. 1. 1. 1. 0. 0. 0. 0.]
 [1. 1. 1. 1. 0. 0. 0. 0.]]

print('열의 갯수가 동일한 두개의 배열을 연결 - vstack()')
ary01 = np.ones((3,4))
ary02 = np.zeros((3,4))
print(np.vstack([ary01,ary02]))

열의 갯수가 동일한 두개의 배열을 연결 - vstack()
[[1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]

print('stack() - 축 axis = 0 | 1')
ary01 = np.ones((3,4))
ary02 = np.zeros((3,4))
aryInfo(np.stack([ary01,ary02], axis = 0))

stack() - 축 axis = 0 | 1
type  -  <class 'numpy.ndarray'>
shape -  (2, 3, 4)
dim   -  3
dtype -  float64

data - 
[[[1. 1. 1. 1.]
  [1. 1. 1. 1.]
  [1. 1. 1. 1.]]

 [[0. 0. 0. 0.]
  [0. 0. 0. 0.]
  [0. 0. 0. 0.]]]

print('dstack - 행이나 열이 아닌 깊이(depth) 방향으로 배열을 합치는 것')
aryInfo(np.dstack([ary01,ary02]))

dstack - 행이나 열이 아닌 깊이(depth) 방향으로 배열을 합치는 것
type  -  <class 'numpy.ndarray'>
shape -  (3, 4, 2)
dim   -  3
dtype -  float64

data - 
[[[1. 0.]
  [1. 0.]
  [1. 0.]
  [1. 0.]]

 [[1. 0.]
  [1. 0.]
  [1. 0.]
  [1. 0.]]

 [[1. 0.]
  [1. 0.]
  [1. 0.]
  [1. 0.]]]

ary = np.arange(1,7).reshape(-1,3) # -1 = 행은 신경 X 열의 갯수를 맞춰서 맞춰달라
np.tile(ary, (3,3))

array([[1, 2, 3, 1, 2, 3, 1, 2, 3],
       [4, 5, 6, 4, 5, 6, 4, 5, 6],
       [1, 2, 3, 1, 2, 3, 1, 2, 3],
       [4, 5, 6, 4, 5, 6, 4, 5, 6],
       [1, 2, 3, 1, 2, 3, 1, 2, 3],
       [4, 5, 6, 4, 5, 6, 4, 5, 6]])

# Quiz
# np.function() 결과가 동일하도록 구현한다면?
# array([[   0.,    0.,    0.,    1.,    1.],
#        [   0.,    0.,    0.,    1.,    1.],
#        [   0.,    0.,    0.,    1.,    1.],
#        [  10.,   20.,   30.,   40.,   50.],
#        [  60.,   70.,   80.,   90.,  100.],
#        [ 110.,  120.,  130.,  140.,  150.],
#        [   0.,    0.,    0.,    1.,    1.],
#        [   0.,    0.,    0.,    1.,    1.],
#        [   0.,    0.,    0.,    1.,    1.],
#        [  10.,   20.,   30.,   40.,   50.],
#        [  60.,   70.,   80.,   90.,  100.],
#        [ 110.,  120.,  130.,  140.,  150.]])


ary01 = np.zeros((3,3))
ary02 = np.ones((3,2))
ary03 = np.arange(10,160,10).reshape(3,5)
ary04 = (np.hstack([ary01,ary02]))
ary05 = (np.vstack([ary04,ary03]))
#print(ary05)
print(np.tile(ary05, (2,1)))

[[  0.   0.   0.   1.   1.]
 [  0.   0.   0.   1.   1.]
 [  0.   0.   0.   1.   1.]
 [ 10.  20.  30.  40.  50.]
 [ 60.  70.  80.  90. 100.]
 [110. 120. 130. 140. 150.]
 [  0.   0.   0.   1.   1.]
 [  0.   0.   0.   1.   1.]
 [  0.   0.   0.   1.   1.]
 [ 10.  20.  30.  40.  50.]
 [ 60.  70.  80.  90. 100.]
 [110. 120. 130. 140. 150.]]

ary = np.random.randint(0,10, (3,4))
print(ary)

[[0 4 8 8]
 [8 6 2 0]
 [8 1 7 5]]

np.delete(ary, 1, axis = 0)
newAry = np.delete(ary, 1)
print(newAry)

[0 8 8 8 6 2 0 8 1 7 5]

print(ary)

[[0 4 8 8]
 [8 6 2 0]
 [8 1 7 5]]

ary = np.array([1,2,3,4,5,6,7,8,9,10])
print(np.sum(ary), np.mean(ary), np.median(ary), np.var(ary), np.std(ary))

55 5.5 5.5 8.25 2.8722813232690143

# 분위수
print('quartile - ', np.percentile(ary, 0))
print('quartile - ', np.percentile(ary, 25))
print('quartile - ', np.percentile(ary, 50))
print('quartile - ', np.percentile(ary, 75))
print('quartile - ', np.percentile(ary, 100))

quartile -  1.0
quartile -  3.25
quartile -  5.5
quartile -  7.75
quartile -  10.0

print('argmin - ', ary[np.argmin(ary)])
print('argmax - ', ary[np.argmax(ary)])
print()
print('any - ', np.any([True, False, True])) # 하나라도 참이면 True
print('all - ', np.all([True, False, True])) # 전체가 같지 않으면 False

argmin -  1
argmax -  10

any -  True
all -  False

ary = np.arange(1,21).reshape(4, -1)
print(ary, type(ary))
aryInfo(ary)

[[ 1  2  3  4  5]
 [ 6  7  8  9 10]
 [11 12 13 14 15]
 [16 17 18 19 20]] <class 'numpy.ndarray'>
type  -  <class 'numpy.ndarray'>
shape -  (4, 5)
dim   -  2
dtype -  int64

data - 
[[ 1  2  3  4  5]
 [ 6  7  8  9 10]
 [11 12 13 14 15]
 [16 17 18 19 20]]

print('2차원 배열에서 차원축소함수를 적용한다면 - 축의 방향이 중요!! axis = 0 , 1')
print('sum - ', np.sum(ary))
print('sum - ', np.sum(ary, axis = 0))
print('sum - ', np.sum(ary, axis = 1))

2차원 배열에서 차원축소함수를 적용한다면 - 축의 방향이 중요!! axis = 0 , 1
sum -  210
sum -  [34 38 42 46 50]
sum -  [15 40 65 90]

print(np.hstack([ary, np.sum(ary, axis = 1).reshape(1,4).T]))

[[ 1  2  3  4  5 15]
 [ 6  7  8  9 10 40]
 [11 12 13 14 15 65]
 [16 17 18 19 20 90]]

ary = np.random.randint(0,1000000, (5,6))
print(ary)

[[326816 764776 245633 331187 837845 278962]
 [298898  93200 315971 654648  22809 622964]
 [213983 730979 540261  56986 382041 836560]
 [345458 722928  56775 945598 580785 412014]
 [881451 198859 286796 343397 308836 366741]]

# Quiz
# 전체의 최대값
print('max - \n', np.max(ary))
# 행 합
print('sum - \n', np.sum(ary, axis = 1))
# 행 합을 열로 추가
print(np.hstack([ary, np.sum(ary, axis = 1).reshape(5,1)]))
# 각 행의 최대값
print('max - \n', np.max(ary, axis = 1))
# 각 행의 최대값을 열로 추가
print('max - \n', np.hstack([ary, np.max(ary, axis = 1).reshape(5,1)]))
# 각 열의 평균을 행으로 추가
print('col mean - \n', np.vstack([ary, np.mean(ary, axis = 0).astype('int32').reshape(1,-1)]))
# 각 열의 최소값
print('col min - \n', np.min(ary, axis = 0))

max - 
 993611
sum - 
 [2355134 2759164 3753749 1562788 3120634]
[[ 272663  341040    8826  499369  730658  502578 2355134]
 [ 384805  210361  798350  733100   73382  559166 2759164]
 [ 159585  755671  420272  922974  566827  928420 3753749]
 [  41312   76008  593076   97690  220786  533916 1562788]
 [ 993611  309508   75527  847784  517204  377000 3120634]]
max - 
 [730658 798350 928420 593076 993611]
max - 
 [[272663 341040   8826 499369 730658 502578 730658]
 [384805 210361 798350 733100  73382 559166 798350]
 [159585 755671 420272 922974 566827 928420 928420]
 [ 41312  76008 593076  97690 220786 533916 593076]
 [993611 309508  75527 847784 517204 377000 993611]]
col mean - 
 [[272663 341040   8826 499369 730658 502578]
 [384805 210361 798350 733100  73382 559166]
 [159585 755671 420272 922974 566827 928420]
 [ 41312  76008 593076  97690 220786 533916]
 [993611 309508  75527 847784 517204 377000]
 [370395 338517 379210 620183 421771 580216]]
col min - 
 [ 41312  76008   8826  97690  73382 377000]

ary = np.arange(10)
print(ary)
print()
np.random.shuffle(ary)
print()
print(ary)
print()
print('np sort - ', np.sort(ary)[::-1]) # 원본에 대한 변경이 없다 변경한 데이터를 유지하려면 변수에 담아야햠
print(ary)
ary.sort() # 원본이 바뀜
print(ary)

[0 1 2 3 4 5 6 7 8 9]


[8 2 0 7 3 6 5 4 9 1]

np sort -  [9 8 7 6 5 4 3 2 1 0]
[8 2 0 7 3 6 5 4 9 1]
[0 1 2 3 4 5 6 7 8 9]

ary = np.random.randint(1,17,(4,4))
ary

array([[13,  1,  8,  6],
       [ 1,  7, 12, 10],
       [ 7, 16,  9,  5],
       [ 3, 13,  4, 12]], dtype=int32)

print('열에 대한 정렬 - ')
print(np.sort(ary, axis = 0))
print()
print('행 대한 정렬 - ')
print(np.sort(ary, axis = 1))

열에 대한 정렬 - 
[[ 1  1  4  5]
 [ 3  7  8  6]
 [ 7 13  9 10]
 [13 16 12 12]]

행 대한 정렬 - 
[[ 1  6  8 13]
 [ 1  7 10 12]
 [ 5  7  9 16]
 [ 3  4 12 13]]

ary = np.array([4,3,5,7])
print(ary)

[4 3 5 7]

sortIdx = np.argsort(ary)
print(sortIdx)

[1 0 2 3]

print(ary[sortIdx])
print(ary[sortIdx][::-1])

[3 4 5 7]
[7 5 4 3]

ary = np.random.randint(1,17,(4,4))
print(ary)

[[ 4 11  1  3]
 [ 1 16  2 12]
 [16  1  4 12]
 [12  2 13 11]]

np.argsort(ary, axis = 0)

array([[1, 2, 0, 0],
       [0, 3, 1, 3],
       [3, 0, 2, 1],
       [2, 1, 3, 2]])

import numpy as np
import pandas as pd

def aryInfo(ary) :
    print('type  - ', type(ary))
    print('shape - ', ary.shape)
    print('dim   - ', ary.ndim)
    print('dtype - ', ary.dtype)
    print()
    print('data - ')
    print(ary)

rawData = np.loadtxt('./data/기후통계분석.csv',
                    dtype = 'U',
                    skiprows = 1,
                    delimiter = ',')

#print('type - ', type(rawData))
aryInfo(rawData)

type  -  <class 'numpy.ndarray'>
shape -  (40414, 5)
dim   -  2
dtype -  <U10

data - 
[['1907-10-01' '108' '13.5' '7.9' '20.7']
 ['1907-10-02' '108' '16.2' '7.9' '22']
 ['1907-10-03' '108' '16.2' '13.1' '21.3']
 ...
 ['2021-08-23' '108' '22.4' '21' '24']
 ['2021-08-24' '108' '23.4' '21.1' '26.4']
 ['2021-08-25' '108' '25' '23.5' '27.3']]

rawData[0 : 6, : ]

array([['1907-10-01', '108', '13.5', '7.9', '20.7'],
       ['1907-10-02', '108', '16.2', '7.9', '22'],
       ['1907-10-03', '108', '16.2', '13.1', '21.3'],
       ['1907-10-04', '108', '16.5', '11.2', '22'],
       ['1907-10-05', '108', '17.6', '10.9', '25.4'],
       ['1907-10-06', '108', '13', '11.2', '21.3']], dtype='<U10')

temp = rawData[: , -1]
#print(temp)
aryInfo(temp)

type  -  <class 'numpy.ndarray'>
shape -  (40414,)
dim   -  1
dtype -  <U10

data - 
['20.7' '22' '21.3' ... '24' '26.4' '27.3']

print('data - ', temp[:10])

data -  ['20.7' '22' '21.3' '22' '25.4' '21.3' '16.1' '14.9' '21.1' '24.1']

temp = temp.astype(float)
print('data - ', temp[:10])

data -  [20.7 22.  21.3 22.  25.4 21.3 16.1 14.9 21.1 24.1]

# Quiz
# 최고기온이 가장 높은 년도의 기후정보를 확인하고 싶다면?

print(rawData[np.argmax(temp)])

maxIdx = np.argmax(rawData[:, 4].astype(float))
print(rawData[maxIdx])

print('max - ', np.max(temp))
print('argmax - ', np.argmax(temp))
print('argsort - ', np.argsort(temp)[-1])
print('argsort - ', np.argsort(temp)[::-1][0])
print()
print('answer - ', rawData[np.argsort(temp)[::-1][0], :])

['2018-08-01' '108' '33.6' '27.8' '39.6']
['2018-08-01' '108' '33.6' '27.8' '39.6']
max -  39.6
argmax -  39293
argsort -  39293
argsort -  39293

answer -  ['2018-08-01' '108' '33.6' '27.8' '39.6']

# Quiz
# 평균기온이 가장 낮은 년도의 기후정보를 확인하고 싶다면?
temp = rawData[: , 2]

print('answer - ', rawData[np.argmin(temp.astype(float)) , :])

answer -  ['1915-01-13' '108' '-19.2' '-21.3' '-16.3']

ary01 = np.arange(1,100001)
ary02 = np.arange(100001, 200001)
print('len - ', len(ary02), len(ary02))

len -  100000 100000

tempAry = np.zeros_like(ary01)
print(tempAry, len(tempAry))

[0 0 0 ... 0 0 0] 100000

%%time
for idx in range(len(ary02)) :
    tempAry[idx] = ary01[idx] + ary02[idx]
print()
print('answer - ', tempAry)

answer -  [100002 100004 100006 ... 299996 299998 300000]
CPU times: total: 31.2 ms
Wall time: 27.9 ms

%%time
tempAry = ary01 + ary02
print()
print('answer - ', tempAry)

answer -  [100002 100004 100006 ... 299996 299998 300000]
CPU times: total: 0 ns
Wall time: 895 μs

ary01 = np.arange(3)
print(ary01)
print(ary01 * 3)

[0 1 2]
[0 3 6]

ary02 = np.arange(12).reshape(-1, 4)
aryInfo(ary02)

type  -  <class 'numpy.ndarray'>
shape -  (3, 4)
dim   -  2
dtype -  int64

data - 
[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]

ary01[ : , np.newaxis] + ary02

array([[ 0,  1,  2,  3],
       [ 5,  6,  7,  8],
       [10, 11, 12, 13]])

ary = np.array([1,2,3,4])
aryInfo(ary)

type  -  <class 'numpy.ndarray'>
shape -  (4,)
dim   -  1
dtype -  int64

data - 
[1 2 3 4]

newAry = ary[ : , np.newaxis]
aryInfo(newAry)

type  -  <class 'numpy.ndarray'>
shape -  (4, 1)
dim   -  2
dtype -  int64

data - 
[[1]
 [2]
 [3]
 [4]]

newAry = ary[ np.newaxis, :]
aryInfo(newAry)

type  -  <class 'numpy.ndarray'>
shape -  (1, 4)
dim   -  2
dtype -  int64

data - 
[[1 2 3 4]]

'''
series(index + value)
index : 정수, 문자, 날짜, 시간 가능하고 중복허용 X
'''

'\nseries(index + value)\nindex : 정수, 문자, 날짜, 시간 가능하고 중복허용 X\n'

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json as j

# warning 제거
import warnings
warnings.filterwarnings('ignore')

print('numpy version - ', np.__version__)
print('pandas version - ', pd.__version__)

numpy version -  2.1.3
pandas version -  2.2.3

lst = [1,2,3,4,5]
print('type - ', type(lst))

print()
ary = np.array(lst)
print('type - ', type(ary))

print()
series = pd.Series(ary)
print('type - ', type(series))
print()
print(series)
print('index - ', series.index)
print('values - ', series.values , type(series.values))
print('dtype - ', series.dtype)

type -  <class 'list'>

type -  <class 'numpy.ndarray'>

type -  <class 'pandas.core.series.Series'>

0    1
1    2
2    3
3    4
4    5
dtype: int64
index -  RangeIndex(start=0, stop=5, step=1)
values -  [1 2 3 4 5] <class 'numpy.ndarray'>
dtype -  int64

def aryInfo(ary) :
    print('type  - ', type(ary))
    print('shape - ', ary.shape)
    print('dim   - ', ary.ndim)
    print('dtype - ', ary.dtype)
    print()
    print('data - ')
    print(ary)
    
def seriesInfo(s) :
    print('type - ', type(s))
    print('index - ', s.index)
    print('values - ', s.values)
    print('dtype - ', s.dtype)
    print()
    print('data - ')
    print(s)

def frmInfo(frm):
    print('type - ', type(frm))
    print('shape - ', frm.shape)
    print('ndim - ', frm.ndim)
    print('row idx - ', frm.index, type(frm.index))
    print('col idx - ', frm.columns, type(frm.columns))
    print('values - ', type(frm.values) )
    print(frm.values)
    print('data - ')
    print(frm)

# 문자인덱스로 시리즈 만든다면?
series = pd.Series({'idx01' : 1, 'idx02' : 2, 'idx03' : 3})
seriesInfo(series)

type -  <class 'pandas.core.series.Series'>
index -  Index(['idx01', 'idx02', 'idx03'], dtype='object')
values -  [1 2 3]
dtype -  int64

data - 
idx01    1
idx02    2
idx03    3
dtype: int64

# series = pd.Series(data = [1,2,3,4,5],
#                     index = ['서초', '송파', '강남', '삼성', '중구'] )
# seriesInfo(series)

series = pd.Series(data = ['임섭순','2025-11-04','Male',True],
                    index = ['이름', '생년월일', '성별', '결혼여부'] )
series.name = '사용자 정보'
series.index.name = '신상 정보'
seriesInfo(series)

print()
print('index - ', series[0], series['이름'])
print('multi indexing - ', series[[0,2]], type(series[[0,2]]))
print(series[['이름', '성별']], type(series[['이름', '성별']]))
print()
print('slicing - ', series[0:3], series['이름':'성별'])

type -  <class 'pandas.core.series.Series'>
index -  Index(['이름', '생년월일', '성별', '결혼여부'], dtype='object', name='신상 정보')
values -  ['임섭순' '2025-11-04' 'Male' True]
dtype -  object

data - 
신상 정보
이름             임섭순
생년월일    2025-11-04
성별            Male
결혼여부          True
Name: 사용자 정보, dtype: object

index -  임섭순 임섭순
multi indexing -  신상 정보
이름     임섭순
성별    Male
Name: 사용자 정보, dtype: object <class 'pandas.core.series.Series'>
신상 정보
이름     임섭순
성별    Male
Name: 사용자 정보, dtype: object <class 'pandas.core.series.Series'>

slicing -  신상 정보
이름             임섭순
생년월일    2025-11-04
성별            Male
Name: 사용자 정보, dtype: object 신상 정보
이름             임섭순
생년월일    2025-11-04
성별            Male
Name: 사용자 정보, dtype: object

print('dir - ', dir(series))

dir -  ['T', '_AXIS_LEN', '_AXIS_ORDERS', '_AXIS_TO_AXIS_NUMBER', '_HANDLED_TYPES', '__abs__', '__add__', '__and__', '__annotations__', '__array__', '__array_priority__', '__array_ufunc__', '__bool__', '__class__', '__column_consortium_standard__', '__contains__', '__copy__', '__deepcopy__', '__delattr__', '__delitem__', '__dict__', '__dir__', '__divmod__', '__doc__', '__eq__', '__finalize__', '__firstlineno__', '__float__', '__floordiv__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__iadd__', '__iand__', '__ifloordiv__', '__imod__', '__imul__', '__init__', '__init_subclass__', '__int__', '__invert__', '__ior__', '__ipow__', '__isub__', '__iter__', '__itruediv__', '__ixor__', '__le__', '__len__', '__lt__', '__matmul__', '__mod__', '__module__', '__mul__', '__ne__', '__neg__', '__new__', '__nonzero__', '__or__', '__pandas_priority__', '__pos__', '__pow__', '__radd__', '__rand__', '__rdivmod__', '__reduce__', '__reduce_ex__', '__repr__', '__rfloordiv__', '__rmatmul__', '__rmod__', '__rmul__', '__ror__', '__round__', '__rpow__', '__rsub__', '__rtruediv__', '__rxor__', '__setattr__', '__setitem__', '__setstate__', '__sizeof__', '__static_attributes__', '__str__', '__sub__', '__subclasshook__', '__truediv__', '__weakref__', '__xor__', '_accessors', '_accum_func', '_agg_examples_doc', '_agg_see_also_doc', '_align_for_op', '_align_frame', '_align_series', '_append', '_arith_method', '_as_manager', '_attrs', '_binop', '_can_hold_na', '_check_inplace_and_allows_duplicate_labels', '_check_is_chained_assignment_possible', '_check_label_or_level_ambiguity', '_check_setitem_copy', '_clear_item_cache', '_clip_with_one_bound', '_clip_with_scalar', '_cmp_method', '_consolidate', '_consolidate_inplace', '_construct_axes_dict', '_construct_result', '_constructor', '_constructor_expanddim', '_constructor_expanddim_from_mgr', '_constructor_from_mgr', '_data', '_deprecate_downcast', '_dir_additions', '_dir_deletions', '_drop_axis', '_drop_labels_or_levels', '_duplicated', '_find_valid_index', '_flags', '_flex_method', '_from_mgr', '_get_axis', '_get_axis_name', '_get_axis_number', '_get_axis_resolvers', '_get_block_manager_axis', '_get_bool_data', '_get_cacher', '_get_cleaned_column_resolvers', '_get_index_resolvers', '_get_label_or_level_values', '_get_numeric_data', '_get_rows_with_mask', '_get_value', '_get_values_tuple', '_get_with', '_getitem_slice', '_gotitem', '_hidden_attrs', '_indexed_same', '_info_axis', '_info_axis_name', '_info_axis_number', '_init_dict', '_init_mgr', '_inplace_method', '_internal_names', '_internal_names_set', '_is_cached', '_is_copy', '_is_label_or_level_reference', '_is_label_reference', '_is_level_reference', '_is_mixed_type', '_is_view', '_is_view_after_cow_rules', '_item_cache', '_ixs', '_logical_func', '_logical_method', '_map_values', '_maybe_update_cacher', '_memory_usage', '_metadata', '_mgr', '_min_count_stat_function', '_name', '_needs_reindex_multi', '_pad_or_backfill', '_protect_consolidate', '_reduce', '_references', '_reindex_axes', '_reindex_indexer', '_reindex_multi', '_reindex_with_indexers', '_rename', '_replace_single', '_repr_data_resource_', '_repr_latex_', '_reset_cache', '_reset_cacher', '_set_as_cached', '_set_axis', '_set_axis_name', '_set_axis_nocheck', '_set_is_copy', '_set_labels', '_set_name', '_set_value', '_set_values', '_set_with', '_set_with_engine', '_shift_with_freq', '_slice', '_stat_function', '_stat_function_ddof', '_take_with_is_copy', '_to_latex_via_styler', '_typ', '_update_inplace', '_validate_dtype', '_values', '_where', 'abs', 'add', 'add_prefix', 'add_suffix', 'agg', 'aggregate', 'align', 'all', 'any', 'apply', 'argmax', 'argmin', 'argsort', 'array', 'asfreq', 'asof', 'astype', 'at', 'at_time', 'attrs', 'autocorr', 'axes', 'backfill', 'between', 'between_time', 'bfill', 'bool', 'case_when', 'clip', 'combine', 'combine_first', 'compare', 'convert_dtypes', 'copy', 'corr', 'count', 'cov', 'cummax', 'cummin', 'cumprod', 'cumsum', 'describe', 'diff', 'div', 'divide', 'divmod', 'dot', 'drop', 'drop_duplicates', 'droplevel', 'dropna', 'dtype', 'dtypes', 'duplicated', 'empty', 'eq', 'equals', 'ewm', 'expanding', 'explode', 'factorize', 'ffill', 'fillna', 'filter', 'first', 'first_valid_index', 'flags', 'floordiv', 'ge', 'get', 'groupby', 'gt', 'hasnans', 'head', 'hist', 'iat', 'idxmax', 'idxmin', 'iloc', 'index', 'infer_objects', 'info', 'interpolate', 'is_monotonic_decreasing', 'is_monotonic_increasing', 'is_unique', 'isin', 'isna', 'isnull', 'item', 'items', 'keys', 'kurt', 'kurtosis', 'last', 'last_valid_index', 'le', 'list', 'loc', 'lt', 'map', 'mask', 'max', 'mean', 'median', 'memory_usage', 'min', 'mod', 'mode', 'mul', 'multiply', 'name', 'nbytes', 'ndim', 'ne', 'nlargest', 'notna', 'notnull', 'nsmallest', 'nunique', 'pad', 'pct_change', 'pipe', 'plot', 'pop', 'pow', 'prod', 'product', 'quantile', 'radd', 'rank', 'ravel', 'rdiv', 'rdivmod', 'reindex', 'reindex_like', 'rename', 'rename_axis', 'reorder_levels', 'repeat', 'replace', 'resample', 'reset_index', 'rfloordiv', 'rmod', 'rmul', 'rolling', 'round', 'rpow', 'rsub', 'rtruediv', 'sample', 'searchsorted', 'sem', 'set_axis', 'set_flags', 'shape', 'shift', 'size', 'skew', 'sort_index', 'sort_values', 'squeeze', 'std', 'str', 'struct', 'sub', 'subtract', 'sum', 'swapaxes', 'swaplevel', 'tail', 'take', 'to_clipboard', 'to_csv', 'to_dict', 'to_excel', 'to_frame', 'to_hdf', 'to_json', 'to_latex', 'to_list', 'to_markdown', 'to_numpy', 'to_period', 'to_pickle', 'to_sql', 'to_string', 'to_timestamp', 'to_xarray', 'transform', 'transpose', 'truediv', 'truncate', 'tz_convert', 'tz_localize', 'unique', 'unstack', 'update', 'value_counts', 'values', 'var', 'view', 'where', 'xs', '결혼여부', '생년월일', '성별', '이름']

keyLst = [key for key in series.keys()]
print('key list - ', keyLst)
print()
valueLst = [value for value in series.values]
print('value list - ', valueLst)

key list -  ['이름', '생년월일', '성별', '결혼여부']

value list -  ['임섭순', '2025-11-04', 'Male', True]

series = pd.Series(range(10,21))
seriesInfo(series)

type -  <class 'pandas.core.series.Series'>
index -  RangeIndex(start=0, stop=11, step=1)
values -  [10 11 12 13 14 15 16 17 18 19 20]
dtype -  int64

data - 
0     10
1     11
2     12
3     13
4     14
5     15
6     16
7     17
8     18
9     19
10    20
dtype: int64

print(series * 2)
print(series.values * 2 , type(series.values * 2))
print('boolean indexing - ')
print(series.values%2 ==0)
print(series.values[series.values%2 ==0])

0     20
1     22
2     24
3     26
4     28
5     30
6     32
7     34
8     36
9     38
10    40
dtype: int64
[20 22 24 26 28 30 32 34 36 38 40] <class 'numpy.ndarray'>
boolean indexing - 
[ True False  True False  True False  True False  True False  True]
[10 12 14 16 18 20]

from datetime import date, datetime, timedelta

today = date(2025, 11, 4)
print(today)

2025-11-04

# 오늘 날짜부터 10일간의 날짜를 생성하여 시리즈의 인덱스로 사용하고 싶다면?

today = date(2025, 11, 4)
idx = [today + timedelta(days=i) for i in range(10)]
s = pd.Series(range(10), index=dates)
print(s)

2025-11-04    0
2025-11-05    1
2025-11-06    2
2025-11-07    3
2025-11-08    4
2025-11-09    5
2025-11-10    6
2025-11-11    7
2025-11-12    8
2025-11-13    9
dtype: int64

date_index = pd.date_range(start = today, periods = 10)
print(date_index)

DatetimeIndex(['2025-11-04', '2025-11-05', '2025-11-06', '2025-11-07',
               '2025-11-08', '2025-11-09', '2025-11-10', '2025-11-11',
               '2025-11-12', '2025-11-13'],
              dtype='datetime64[ns]', freq='D')

series = pd.Series(data = [np.random.randint(1,100) for _ in range(10)],
                  index = date_index)

print(series)

2025-11-04    57
2025-11-05    85
2025-11-06    43
2025-11-07    85
2025-11-08    44
2025-11-09    33
2025-11-10    83
2025-11-11    48
2025-11-12    64
2025-11-13    17
Freq: D, dtype: int64

print(series['2025-11-04'])

57

# 결측값, null : isnull(), notnull()
series['2025-11-10'] = np.nan
print(series)

2025-11-04    57.0
2025-11-05    85.0
2025-11-06    43.0
2025-11-07    85.0
2025-11-08    44.0
2025-11-09    33.0
2025-11-10     NaN
2025-11-11    48.0
2025-11-12    64.0
2025-11-13    17.0
Freq: D, dtype: float64

pd.isnull(series)
print()
pd.notnull(series)

2025-11-04     True
2025-11-05     True
2025-11-06     True
2025-11-07     True
2025-11-08     True
2025-11-09     True
2025-11-10    False
2025-11-11     True
2025-11-12     True
2025-11-13     True
Freq: D, dtype: bool

# 결측값은 평균이나 중위수의 값을 대체
series[pd.isnull(series)] = np.mean(series)
print(series)

2025-11-04    57.000000
2025-11-05    85.000000
2025-11-06    43.000000
2025-11-07    85.000000
2025-11-08    44.000000
2025-11-09    33.000000
2025-11-10    52.888889
2025-11-11    48.000000
2025-11-12    64.000000
2025-11-13    17.000000
Freq: D, dtype: float64

# fillna() : 결측값을 원하는 값으로 채우고자 할 때
series['2025-11-10'] = np.nan
print(series)

2025-11-04    57.0
2025-11-05    85.0
2025-11-06    43.0
2025-11-07    85.0
2025-11-08    44.0
2025-11-09    33.0
2025-11-10     NaN
2025-11-11    48.0
2025-11-12    64.0
2025-11-13    17.0
Freq: D, dtype: float64

series = series.fillna(0)
print(series)

2025-11-04    57.0
2025-11-05    85.0
2025-11-06    43.0
2025-11-07    85.0
2025-11-08    44.0
2025-11-09    33.0
2025-11-10     0.0
2025-11-11    48.0
2025-11-12    64.0
2025-11-13    17.0
Freq: D, dtype: float64

# subset (부분집합)
series[3] = np.nan
series[8] = np.nan
print(series)

2025-11-04    57.0
2025-11-05    85.0
2025-11-06    43.0
2025-11-07     NaN
2025-11-08    44.0
2025-11-09    33.0
2025-11-10     0.0
2025-11-11    48.0
2025-11-12     NaN
2025-11-13    17.0
Freq: D, dtype: float64

# Quiz
# 원본데이터에서 결측값을 제외한 subset을 만들고 싶다면?
subset = series[pd.notnull(series)]
print(subset)

2025-11-04    57.0
2025-11-05    85.0
2025-11-06    43.0
2025-11-08    44.0
2025-11-09    33.0
2025-11-10     0.0
2025-11-11    48.0
2025-11-13    17.0
dtype: float64

'''
DataFrame(표 형식 - 행, 열) : .csv 을 읽어서 표로
- series는 인덱스만 있었지만
- 행 인덱스, 열 인덱스가 있다
- pd.DataFrame(data= ,columns= ,index= )
- dict, [[]] , 
'''

'\nDataFrame(표 형식 - 행, 열) : .csv 을 읽어서 표로\n- series는 인덱스만 있었지만\n- 행 인덱스, 열 인덱스가 있다\n- pd.DataFrame(data= ,columns= ,index= )\n- dict, [[]] , \n'

frm = pd.DataFrame({
    'feature01' : [1,2,3],
    'feature02' : [1,2,3],
    'feature03' : [1,2,3],
})
print(frm)
print('type - ', type(frm))
print('shape - ', frm.shape)
print('ndim - ', frm.ndim)
print('row idx - ', frm.index, type(frm.index))
print('col idx - ', frm.columns, type(frm.columns))
print('values - ', type(frm.values) )
print(frm.values)

   feature01  feature02  feature03
0          1          1          1
1          2          2          2
2          3          3          3
type -  <class 'pandas.core.frame.DataFrame'>
shape -  (3, 3)
ndim -  2
row idx -  RangeIndex(start=0, stop=3, step=1) <class 'pandas.core.indexes.range.RangeIndex'>
col idx -  Index(['feature01', 'feature02', 'feature03'], dtype='object') <class 'pandas.core.indexes.base.Index'>
values -  <class 'numpy.ndarray'>
[[1 1 1]
 [2 2 2]
 [3 3 3]]

frm = pd.DataFrame(data = 
                    [[1,2,3],
                    [1,2,3],
                    [1,2,3]],
                    columns = ['A', 'B', 'C'],
                    index = ['user_'+str(idx) for idx in range(3)])

frmInfo(frm)

type -  <class 'pandas.core.frame.DataFrame'>
shape -  (3, 3)
ndim -  2
row idx -  Index(['user_0', 'user_1', 'user_2'], dtype='object') <class 'pandas.core.indexes.base.Index'>
col idx -  Index(['A', 'B', 'C'], dtype='object') <class 'pandas.core.indexes.base.Index'>
values -  <class 'numpy.ndarray'>
[[1 2 3]
 [1 2 3]
 [1 2 3]]
data - 
        A  B  C
user_0  1  2  3
user_1  1  2  3
user_2  1  2  3

print('전처리(pre-processing) - 열 이름 변경, 인덱스 변경, rename()')
frm.rename(columns = {'A':'name', 'B':'gender', 'C':'isMarraige'},
          inplace = True,
          index = {'user_1' : 'customer01'})

전처리(pre-processing) - 열 이름 변경, 인덱스 변경, rename()

frmInfo(frm)

type -  <class 'pandas.core.frame.DataFrame'>
shape -  (3, 3)
ndim -  2
row idx -  Index(['user_0', 'customer01', 'user_2'], dtype='object') <class 'pandas.core.indexes.base.Index'>
col idx -  Index(['name', 'gender', 'isMarraige'], dtype='object') <class 'pandas.core.indexes.base.Index'>
values -  <class 'numpy.ndarray'>
[[1 2 3]
 [1 2 3]
 [1 2 3]]
data - 
            name  gender  isMarraige
user_0         1       2           3
customer01     1       2           3
user_2         1       2           3

print('데이터 추출 - indexing')
print('type - ', type(frm['name']))
print(frm['name'], type(frm['name']))
print(frm['name'].values, type(frm['name'].values))

데이터 추출 - indexing
type -  <class 'pandas.core.series.Series'>
user_0        1
customer01    1
user_2        1
Name: name, dtype: int64 <class 'pandas.core.series.Series'>
[1 1 1] <class 'numpy.ndarray'>

print('데이터 추가')
frm['age'] = [10,20,30]

데이터 추가

print(frm)

            name  gender  isMarraige  age
user_0         1       2           3   10
customer01     1       2           3   20
user_2         1       2           3   30

print('삭제 - 열')
del frm['age']

삭제 - 열

print(frm)

            name  gender  isMarraige
user_0         1       2           3
customer01     1       2           3
user_2         1       2           3

print('만약 , 행 인덱싱을 하고 싶다면? - 정답 : 슬라이싱(정수, 문자)')
print(frm)
print()
print(frm[0:1]) 
print()
print(frm[ : 'customer01'])

만약 , 행 인덱싱을 하고 싶다면? - 정답 : 슬라이싱(정수, 문자)
            name  gender  isMarraige
user_0         1       2           3
customer01     1       2           3
user_2         1       2           3

        name  gender  isMarraige
user_0     1       2           3

            name  gender  isMarraige
user_0         1       2           3
customer01     1       2           3

print(frm['name'][:'customer01'])

user_0        1
customer01    1
Name: name, dtype: int64

import urllib.request
import json

# sample json api
endPoint = 'https://jsonplaceholder.typicode.com/posts'
response = urllib.request.urlopen(endPoint)
print('response - ')
# print(response.read())
# json parsing
result = json.loads(response.read())
# print(result)
print('type - ', type(result))
print('keys - ', result[0].keys())

response - 
type -  <class 'list'>
keys -  dict_keys(['userId', 'id', 'title', 'body'])

frm = pd.DataFrame(result)

#frmInfo(frm)

# Quiz
# https://dummyjson.com/carts
# 해당 사이트로부터 json 데이터를 불러와 각 장바구니의 userId, total, discountedTotal
# 그리고 포함된 각 product의 title, price, quantity
# 이걸 하나의 행으로 나타내는 frm 생성한다면?
import urllib.request
import json

endPoint = 'https://dummyjson.com/carts'
response = urllib.request.urlopen(endPoint)
result = json.loads(response.read())

carts = result['carts']

rows = []

for cart in carts:
    for prod in cart['products']:
        rows.append({
            "userId" : cart["userId"],
            "total" : cart["total"],
            "discountedTotal" : cart["discountedTotal"],
            "title" : prod["title"],
            "price" : prod["price"],
            "quantity" : prod["quantity"]
        })

frm = pd.DataFrame(rows)
print(frm.head())

   userId      total  discountedTotal                                 title  \
0      33  103774.85         89686.65                       Charger SXT RWD   
1      33  103774.85         89686.65  Apple MacBook Pro 14 Inch Space Grey   
2      33  103774.85         89686.65                    Green Oval Earring   
3      33  103774.85         89686.65                         Apple Airpods   
4     142    4794.80          4288.95                        Cricket Helmet   

      price  quantity  
0  32999.99         3  
1   1999.99         2  
2     24.99         5  
3    129.99         5  
4     44.99         4

endPoint = 'https://dummyjson.com/carts'
response = urllib.request.urlopen(endPoint)
result = json.loads(response.read())

carts = result['carts']

rows = []
for cart in carts:
    userId = cart['userId']
    total = cart['total']
    discountedTotal = cart['discountedTotal']
    
    for p in cart['products']:
        rows.append({
            'userId': userId,
            'total': total,
            'discountedTotal': discountedTotal,
            'title': p['title'],
            'price': p['price'],
            'quantity': p['quantity']
        })


frm = pd.DataFrame(rows)
print(frm)

     userId      total  discountedTotal                                 title  \
0        33  103774.85         89686.65                       Charger SXT RWD   
1        33  103774.85         89686.65  Apple MacBook Pro 14 Inch Space Grey   
2        33  103774.85         89686.65                    Green Oval Earring   
3        33  103774.85         89686.65                         Apple Airpods   
4       142    4794.80          4288.95                        Cricket Helmet   
..      ...        ...              ...                                   ...   
116     170    3862.43          3488.44                            Volleyball   
117     177  128249.07        118740.76                Marni Red & Black Suit   
118     177  128249.07        118740.76                      Pacifica Touring   
119     177  128249.07        118740.76                              Potatoes   
120     177  128249.07        118740.76                             Plant Pot   

        price  quantity  
0    32999.99         3  
1     1999.99         2  
2       24.99         5  
3      129.99         5  
4       44.99         4  
..        ...       ...  
116     11.99         5  
117    179.99         1  
118  31999.99         4  
119      2.29         4  
120     14.99         4  

[121 rows x 6 columns]

# 통계 분석, 탐색적 데이터 분석
# 브랜드별 평균 상품가 확인

frm.groupby('title')['price'].mean().sort_values(ascending=False).head()

title
Charger SXT RWD            32999.99
Pacifica Touring           31999.99
300 Touring                28999.99
Rolex Cellini Moonphase    15999.99
Rolex Submariner Watch     13999.99
Name: price, dtype: float64

# 사용자별 총 구매액
users = frm.groupby('userId')['total'].sum().sort_values(ascending=False).head()

import matplotlib.pyplot as plt

users.plot(kind='bar', figsize=(10,4), title="user purchase (USD)")
plt.show()

import numpy  as np 
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt 
import json   

# warning 제거
import warnings
warnings.filterwarnings('ignore')

# version check 
print('numpy  version - ' , np.__version__)
print('pandas version - ' , pd.__version__)

# 데이터 정보 출력 함수 
def aryInfo(ary) : 
    print('type - ' , type(ary)) 
    print('shape - ' , ary.shape)
    print('ndim  - ' , ary.ndim)
    print('dtype - ' , ary.dtype)
    print()
    print('data  -')
    print(ary)

def seriesInfo(s) :
    print('type   - ' , type(s)) 
    print('index  - ' , s.index)
    print('values - ' , s.values)
    print('dtype  - ' , s.dtype)
    print()
    print('data   - ')
    print(s)

def frmInfo(frm) :
    print('type    - ' , type(frm))
    print('shape   - ' , frm.shape)
    print('ndim    - ' , frm.ndim)
    print('row idx - ' , frm.index , type(frm.index))
    print('col idx - ' , frm.columns , type(frm.columns))
    print('values  - ' , type(frm.values))
    print(frm.values)
    print('data - ') 
    print(frm)

numpy  version -  2.1.3
pandas version -  2.2.3

# 프레임은 데이터가 있는 열의 집합
scores = {
    'kor' : [90,85,100,88,78],
    'eng' : [90,85,100,88,78],
    'mat' : [90,85,100,88,78]
} # 프레임으로 만들면 행이 5개 열이 3개 생성

frm = pd.DataFrame(scores,
                  index = ['강승우', '최호준', '임정섭', '이현우', '오신호'])

frmInfo(frm)

type    -  <class 'pandas.core.frame.DataFrame'>
shape   -  (5, 3)
ndim    -  2
row idx -  Index(['강승우', '최호준', '임정섭', '이현우', '오신호'], dtype='object') <class 'pandas.core.indexes.base.Index'>
col idx -  Index(['kor', 'eng', 'mat'], dtype='object') <class 'pandas.core.indexes.base.Index'>
values  -  <class 'numpy.ndarray'>
[[ 90  90  90]
 [ 85  85  85]
 [100 100 100]
 [ 88  88  88]
 [ 78  78  78]]
data - 
     kor  eng  mat
강승우   90   90   90
최호준   85   85   85
임정섭  100  100  100
이현우   88   88   88
오신호   78   78   78

frm

# Quiz
# 모든 학생의 과목평균 점수를 새로운 열('mean') 추가하고 싶다면?
# axis = 0 (열), axis = 1 (행)
frm.values

array([[ 90,  90,  90],
       [ 85,  85,  85],
       [100, 100, 100],
       [ 88,  88,  88],
       [ 78,  78,  78]])

frm['mean'] = np.mean(frm.values, axis=1).astype(np.int32)
frm

# Quiz
# 최호준 학생의 영어점수를 90점으로 수정하고 평균 점수도 다시 계산
# iloc[], loc[] 행 인덱싱에서 사용하는 함수

frm.loc['최호준', 'eng'] = 90
frm['mean'] = np.mean(frm.values, axis=1).astype(np.int32)
frm

# 시리즈
lee = frm.loc['이현우']
print(lee)
print('type - ', type(lee))

kor     88
eng     88
mat     88
mean    88
Name: 이현우, dtype: int64
type -  <class 'pandas.core.series.Series'>

# 데이터 프레임
lee = frm.loc[['이현우']]
print(lee)
print('type - ', type(lee))

     kor  eng  mat  mean
이현우   88   88   88    88
type -  <class 'pandas.core.frame.DataFrame'>

lim  = frm.loc['임정섭', 'kor' : 'eng']
print(lim)
print('type - ', type(lim))

kor    100
eng    100
Name: 임정섭, dtype: int64
type -  <class 'pandas.core.series.Series'>

titanicRawData = sns.load_dataset('titanic')
print('type - ', type(titanicRawData))
titanicRawData.head()

type -  <class 'pandas.core.frame.DataFrame'>

titanicRawData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
 15  age_by_10    891 non-null    int64   
dtypes: bool(2), category(2), float64(2), int64(5), object(5)
memory usage: 87.6+ KB

# Quiz
# 선실등급(pclass)의 인원수를 확인하고 싶다면?

print('유일값 확인 - unique()')
print(titanicRawData['pclass'].unique())
print(titanicRawData['pclass'].value_counts().values)

유일값 확인 - unique()
[3 1 2]
[491 216 184]

# 데이터 프레임의 컬럼명 확인
print('type - ', type(titanicRawData.columns))
print(titanicRawData.columns)

type -  <class 'pandas.core.indexes.base.Index'>
Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')

# Quiz
# 기존나이에 10살을 더해서 age_by_10 열을 추가하고 싶다면?


titanicRawData['age_by_10'] = (titanicRawData['age'].values + 10).astype('int')
titanicRawData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
 15  age_by_10    891 non-null    int64   
dtypes: bool(2), category(2), float64(2), int64(5), object(5)
memory usage: 87.6+ KB

# age_by_10 컬럼을 삭제하고 싶다면?
# drop( , axis = 1, inplace = True)
titanicRawData.drop(['age_by_10'], axis = 1 , inplace = True)

titanicRawData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB

# 요금(fare)에 대한 통계(최대, 최소, 평균, 합계) 확인이 필요하다면?
print('fare max - ', np.max(titanicRawData['fare'].values))
print('fare min - ', np.min(titanicRawData['fare'].values))
print('fare mean - ', np.mean(titanicRawData['fare'].values))

fare max -  512.3292
fare min -  0.0
fare mean -  32.204207968574636

# Quiz
# 선실등급(pclass)이 3등급인 데이터만의 subset 만들고 싶다면?
pclassSubsetFrm = titanicRawData[titanicRawData['pclass'] == 3]
print('type - ', type(pclassSubsetFrm))
print(pclassSubsetFrm)

type -  <class 'pandas.core.frame.DataFrame'>
     survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0           0       3    male  22.0      1      0   7.2500        S  Third   
2           1       3  female  26.0      0      0   7.9250        S  Third   
4           0       3    male  35.0      0      0   8.0500        S  Third   
5           0       3    male   NaN      0      0   8.4583        Q  Third   
7           0       3    male   2.0      3      1  21.0750        S  Third   
..        ...     ...     ...   ...    ...    ...      ...      ...    ...   
882         0       3  female  22.0      0      0  10.5167        S  Third   
884         0       3    male  25.0      0      0   7.0500        S  Third   
885         0       3  female  39.0      0      5  29.1250        Q  Third   
888         0       3  female   NaN      1      2  23.4500        S  Third   
890         0       3    male  32.0      0      0   7.7500        Q  Third   

       who  adult_male deck  embark_town alive  alone  
0      man        True  NaN  Southampton    no  False  
2    woman       False  NaN  Southampton   yes   True  
4      man        True  NaN  Southampton    no   True  
5      man        True  NaN   Queenstown    no   True  
7    child       False  NaN  Southampton    no  False  
..     ...         ...  ...          ...   ...    ...  
882  woman       False  NaN  Southampton    no   True  
884    man        True  NaN  Southampton    no   True  
885  woman       False  NaN   Queenstown    no  False  
888  woman       False  NaN  Southampton    no  False  
890    man        True  NaN   Queenstown    no   True  

[491 rows x 15 columns]

# Quiz
# 위 서브셋에서 성별과(sex), 생존여부(survived)만 가지는 새로운 서브셋을 만들고 싶다면?
newsubset = pclassSubsetFrm[['sex','survived']]
print(newsubset.head())

      sex  survived
0    male         0
1  female         1
2    male         0
3    male         0
4    male         0

# 인덱스 재조정
# reset_index()
pclassSubsetFrm.reset_index(inplace = True)
print(pclassSubsetFrm.head())

   index  survived  pclass     sex   age  sibsp  parch     fare embarked  \
0      0         0       3    male  22.0      1      0   7.2500        S   
1      1         1       3  female  26.0      0      0   7.9250        S   
2      2         0       3    male  35.0      0      0   8.0500        S   
3      3         0       3    male   NaN      0      0   8.4583        Q   
4      4         0       3    male   2.0      3      1  21.0750        S   

   class    who  adult_male deck  embark_town alive  alone  
0  Third    man        True  NaN  Southampton    no  False  
1  Third  woman       False  NaN  Southampton   yes   True  
2  Third    man        True  NaN  Southampton    no   True  
3  Third    man        True  NaN   Queenstown    no   True  
4  Third  child       False  NaN  Southampton    no  False

print(pclassSubsetFrm)

     index  survived  pclass     sex   age  sibsp  parch     fare embarked  \
0        0         0       3    male  22.0      1      0   7.2500        S   
1        1         1       3  female  26.0      0      0   7.9250        S   
2        2         0       3    male  35.0      0      0   8.0500        S   
3        3         0       3    male   NaN      0      0   8.4583        Q   
4        4         0       3    male   2.0      3      1  21.0750        S   
..     ...       ...     ...     ...   ...    ...    ...      ...      ...   
486    486         0       3  female  22.0      0      0  10.5167        S   
487    487         0       3    male  25.0      0      0   7.0500        S   
488    488         0       3  female  39.0      0      5  29.1250        Q   
489    489         0       3  female   NaN      1      2  23.4500        S   
490    490         0       3    male  32.0      0      0   7.7500        Q   

     class    who  adult_male deck  embark_town alive  alone  
0    Third    man        True  NaN  Southampton    no  False  
1    Third  woman       False  NaN  Southampton   yes   True  
2    Third    man        True  NaN  Southampton    no   True  
3    Third    man        True  NaN   Queenstown    no   True  
4    Third  child       False  NaN  Southampton    no  False  
..     ...    ...         ...  ...          ...   ...    ...  
486  Third  woman       False  NaN  Southampton    no   True  
487  Third    man        True  NaN  Southampton    no   True  
488  Third  woman       False  NaN   Queenstown    no  False  
489  Third  woman       False  NaN  Southampton    no  False  
490  Third    man        True  NaN   Queenstown    no   True  

[491 rows x 16 columns]

pclassSubsetFrm.drop('index', axis = 1 , inplace =True)
pclassSubsetFrm

# set_index : 특정 컬럼을 인덱스로 변경하는 함수

pclassSubsetFrm.reset_index(inplace=True)
pclassSubsetFrm

pclassSubsetFrm.set_index('index', inplace=True)
pclassSubsetFrm

# Quiz
# 원본 데이터로부터 나이가 60이상이고 선실등급이 1등급이면서 성별이 여자인 데이터만 추출하여 서브셋을 만들고 싶다면?

sub = titanicRawData[(titanicRawData['age'] >= 60) & (titanicRawData['pclass'] == 1) & (titanicRawData['sex'] == 'female')]
print(sub)

     survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
275         1       1  female  63.0      1      0  77.9583        S  First   
366         1       1  female  60.0      1      0  75.2500        C  First   
829         1       1  female  62.0      0      0  80.0000      NaN  First   

       who  adult_male deck  embark_town alive  alone  
275  woman       False    D  Southampton   yes  False  
366  woman       False    D    Cherbourg   yes  False  
829  woman       False    B          NaN   yes   True

sub = titanicRawData[(titanicRawData['age'] >= 60) & (titanicRawData['pclass'] == 1) & (titanicRawData['sex'] == 'male')]
print(sub)

     survived  pclass   sex   age  sibsp  parch      fare embarked  class  \
54          0       1  male  65.0      0      1   61.9792        C  First   
96          0       1  male  71.0      0      0   34.6542        C  First   
170         0       1  male  61.0      0      0   33.5000        S  First   
252         0       1  male  62.0      0      0   26.5500        S  First   
438         0       1  male  64.0      1      4  263.0000        S  First   
456         0       1  male  65.0      0      0   26.5500        S  First   
493         0       1  male  71.0      0      0   49.5042        C  First   
545         0       1  male  64.0      0      0   26.0000        S  First   
555         0       1  male  62.0      0      0   26.5500        S  First   
587         1       1  male  60.0      1      1   79.2000        C  First   
625         0       1  male  61.0      0      0   32.3208        S  First   
630         1       1  male  80.0      0      0   30.0000        S  First   
694         0       1  male  60.0      0      0   26.5500        S  First   
745         0       1  male  70.0      1      1   71.0000        S  First   

     who  adult_male deck  embark_town alive  alone  
54   man        True    B    Cherbourg    no  False  
96   man        True    A    Cherbourg    no   True  
170  man        True    B  Southampton    no   True  
252  man        True    C  Southampton    no   True  
438  man        True    C  Southampton    no  False  
456  man        True    E  Southampton    no   True  
493  man        True  NaN    Cherbourg    no   True  
545  man        True  NaN  Southampton    no   True  
555  man        True  NaN  Southampton    no   True  
587  man        True    B    Cherbourg   yes  False  
625  man        True    D  Southampton    no   True  
630  man        True    A  Southampton   yes   True  
694  man        True  NaN  Southampton    no   True  
745  man        True    B  Southampton    no  False

# 원본데이터로부터 승객의 나이를 기준으로 내림차순한 서브셋을 만들고 싶다면?
subsetFrm = titanicRawData.sort_values(by='age', ascending=False)
subsetFrm.reset_index(inplace = True)
subsetFrm.drop('index', axis = 1, inplace = True)
subsetFrm

print('성별에 따른 승객수를 시각화하기 위해서 정렬을 한다면 - ')
titanicRawData['sex'].value_counts().sort_values(ascending = False)

성별에 따른 승객수를 시각화하기 위해서 정렬을 한다면 -

sex
male      577
female    314
Name: count, dtype: int64

titanicRawData.sort_index(ascending= False)

namesFrm = pd.read_csv('./data/year2022_baby_name.csv',
                      sep=',',
                      encoding='utf-8')
print('type - ', type(namesFrm))

type -  <class 'pandas.core.frame.DataFrame'>

namesFrm.head()

namesFrm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33838 entries, 0 to 33837
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   NAME    33838 non-null  object
 1   GENDER  33838 non-null  object
 2   COUNT   33838 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 793.2+ KB

# namesFrm.describe() # 숫자데이터 통계정보
namesFrm.columns

Index(['NAME', 'GENDER', 'COUNT'], dtype='object')

# Quiz
# count 열을 기준으로 내림차순 정렬하여 서브셋을 만든다면?
# 인덱스를 확인하고 reset시킨후
# 불필요한 'index' 열 삭제

subset = namesFrm.sort_values(by='COUNT', ascending=False)
subset.reset_index(inplace = True)
subset.drop('index', axis = 1, inplace = True)
subset

# Quiz
# 열 이름을 변경하고자 하고( NAME -> name, GENDER -> gender , COUNT -> count)
# 성별(gender)이 남자인 데이터를 추출한다면?

# type(subset.columns.values)

# for col in subset.columns.values:
#     print(col.lower())

cols = [ col.lower() for col in subset.columns.values]
subset.columns = cols
print(subset)

# subset.rename(columns = cols, inplace = True) # 리스트 형태라서 안됨 딕셔너리로 만들어줘야함
# subset[subset['gender']=='M']

subset.rename(columns = {col: col.lower() for col in subset.columns}, inplace = True) # 딕셔너리 컴프리헨션
subset[subset['gender']=='M']

# subset.rename(columns={'NAME': 'name', 'GENDER': 'gender', 'COUNT': 'count'}, inplace=True)
# M = subset[subset['gender'] == 'M']
# print(M)

           name gender  count
0      Isabella      F  22731
1         Jacob      M  21875
2        Sophia      F  20477
3         Ethan      M  17866
4          Emma      F  17179
...         ...    ...    ...
33833     Xaine      M      5
33834    Xaveon      M      5
33835   Xavious      M      5
33836    Xiomar      M      5
33837     Xylan      M      5

[33838 rows x 3 columns]

frm = pd.read_csv('./data/service_data_groupby_sample.csv',
                      encoding='cp949')
print('type - ', type(frm))
frm

type -  <class 'pandas.core.frame.DataFrame'>

frm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      12 non-null     int64 
 1   gender  12 non-null     object
 2   height  12 non-null     int64 
 3   age     12 non-null     int64 
 4   region  12 non-null     object
dtypes: int64(3), object(2)
memory usage: 612.0+ bytes

# Quiz
# 지역별 나이평균을 확인하고 싶다면??

# type(frm.groupby('region').get_group('경기'))
# type(frm.groupby('region').get_group('경기')['age'])

frm.groupby('region')['age'].mean()

region
경기    32.00
서울    28.25
인천    39.00
충북    33.00
Name: age, dtype: float64

# Quiz
# 성별을 기준으로 그룹을 나누고 싶다면 -
tmp = frm.groupby('gender')[['height']].mean()
tmp.reset_index(inplace = True)
tmp

# 다중통계량 : agg()
subset = frm.drop('region', axis = 1)
subset

subset.groupby('gender').agg(['mean', 'var', 'std'])

frm.groupby('gender')['age'].agg(['max', 'min', 'mean', 'median']).reset_index()

# Quiz
# 성별에 따른 거주지의 최빈값(mode())을 확인하고 싶다면?

# frm.groupby('gender')['region'].agg(lambda x : x.mode())
frm.groupby('gender')['region'].apply(lambda x : x.mode())

gender   
남자      0    서울
여자      0    서울
Name: region, dtype: object

tipsFrm = sns.load_dataset('tips')
tipsFrm.head()

tipsFrm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB

tipsFrm.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

titanicFrm = sns.load_dataset('titanic')
titanicFrm.head()

# Quiz
# tipsFrm 전체 평균 팁은 얼마일까요?

tip = tipsFrm['tip'].mean()
print(tip)

2.99827868852459

# Quiz
# 남성과 여성 중 평균 팁이 더 높은 성별을 확인하고 싶다면?

print(tipsFrm.groupby('sex')['tip'].agg('mean').reset_index())

# 흡연자와 비흡연자 중 편균 팁 비율이 높은 그룹은?
# assign
print(tipsFrm.assign(pct = tipsFrm['tip'] / tipsFrm['total_bill']).groupby('smoker')['pct'].mean())

      sex       tip
0    Male  3.089618
1  Female  2.833448
smoker
Yes    0.163196
No     0.159328
Name: pct, dtype: float64

# Quiz
# 팁이 가장 많이 발생하는 요일을 확인하고 싶다면?

result = tipsFrm.groupby('day')['tip'].sum().sort_values(ascending=False).head(1)
print('type - ', type(result))
print(result)

type -  <class 'pandas.core.series.Series'>
day
Sat    260.4
Name: tip, dtype: float64

# Quiz
# ['Dinner', 'Lunch'] 중 평균 팁 비율이 높은 시간대를 확인하고 싶다면?

# tipsFrm['time'].unique()
# tipsFrm.groupby('time')['tip'].mean()

tipsFrm.assign(pct = tipsFrm['tip'] / tipsFrm['total_bill']).groupby('time')['pct'].mean().sort_values(ascending=False).head(1)

tipsFrm['pct'] = tipsFrm['tip'] / tipsFrm['total_bill']
result = tipsFrm.groupby('time')['pct'].mean().sort_values(ascending = False).head(1)
print(result)

time
Lunch    0.164128
Name: pct, dtype: float64

# Quiz : titanicFrm
# subset - (age,sex,class,fare,survived)

# subset = titanicFrm[['age', 'sex', 'class', 'fare', 'survived']]
# subset.head()

subset = titanicFrm.loc[: ,['age', 'sex', 'class', 'fare', 'survived']]
subset.head()

# Quiz 
# 선실등급에 따른 그룹을 만들고 1등급 승객만 데이터 프레임 형식으로 만들어 본다면?

# firstClass = subset[subset['class'] == 'First']
# print(firstClass) 

grp = subset.groupby('class')
# print(grp)
# print(grp.get_group('First'))

result = grp.get_group('First')
print(result)

#subset.loc[grp.groups['First'].values , : ]

      age     sex  class     fare  survived
1    38.0  female  First  71.2833         1
3    35.0  female  First  53.1000         1
6    54.0    male  First  51.8625         0
11   58.0  female  First  26.5500         1
23   28.0    male  First  35.5000         1
..    ...     ...    ...      ...       ...
871  47.0  female  First  52.5542         1
872  33.0    male  First   5.0000         0
879  56.0  female  First  83.1583         1
887  19.0  female  First  30.0000         1
889  26.0    male  First  30.0000         1

[216 rows x 5 columns]

irisFrm = sns.load_dataset('iris')
irisFrm.head()

irisFrm['species'].value_counts()

species
setosa        50
versicolor    50
virginica     50
Name: count, dtype: int64

grp = irisFrm.groupby('species')
print(grp)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000014B9D9DF050>

print(grp.groups)

{'setosa': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49], 'versicolor': [50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99], 'virginica': [100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149]}

for key, group in grp :
    print('key - ', key)
    print()
    display(group)

key -  setosa

key -  versicolor

key -  virginica

irisFrm.sort_values(by='petal_length', ascending = False).groupby('species').get_group('setosa')

titanicFrm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB

print('age 결측값 - ')
print(titanicFrm['age'].isnull().sum())

age 결측값 - 
177

titanicFrm.groupby('sex')['age'].mean()

sex
female    27.915709
male      30.726645
Name: age, dtype: float64

titanicFrm.groupby('sex')['age'].groups

{'female': [1, 2, 3, 8, 9, 10, 11, 14, 15, 18, 19, 22, 24, 25, 28, 31, 32, 38, 39, 40, 41, 43, 44, 47, 49, 52, 53, 56, 58, 61, 66, 68, 71, 79, 82, 84, 85, 88, 98, 100, 106, 109, 111, 113, 114, 119, 123, 128, 132, 133, 136, 140, 141, 142, 147, 151, 156, 161, 166, 167, 172, 177, 180, 184, 186, 190, 192, 194, 195, 198, 199, 205, 208, 211, 215, 216, 218, 229, 230, 233, 235, 237, 240, 241, 246, 247, 251, 254, 255, 256, 257, 258, 259, 264, 268, 269, 272, 274, 275, 276, ...], 'male': [0, 4, 5, 6, 7, 12, 13, 16, 17, 20, 21, 23, 26, 27, 29, 30, 33, 34, 35, 36, 37, 42, 45, 46, 48, 50, 51, 54, 55, 57, 59, 60, 62, 63, 64, 65, 67, 69, 70, 72, 73, 74, 75, 76, 77, 78, 80, 81, 83, 86, 87, 89, 90, 91, 92, 93, 94, 95, 96, 97, 99, 101, 102, 103, 104, 105, 107, 108, 110, 112, 115, 116, 117, 118, 120, 121, 122, 124, 125, 126, 127, 129, 130, 131, 134, 135, 137, 138, 139, 143, 144, 145, 146, 148, 149, 150, 152, 153, 154, 155, ...]}

tmp = titanicFrm.groupby('sex')['age'].apply(lambda x : x.fillna(x.mean()))

titanicFrm['age'] = tmp.values
print('age 결측값 - ')
print(titanicFrm['age'].isnull().sum())

age 결측값 - 
0

titanicFrm

import numpy  as np 
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt 
import json   

# warning 제거
import warnings
warnings.filterwarnings('ignore')

# version check 
print('numpy  version - ' , np.__version__)
print('pandas version - ' , pd.__version__)

# 데이터 정보 출력 함수 
def aryInfo(ary) : 
    print('type - ' , type(ary)) 
    print('shape - ' , ary.shape)
    print('ndim  - ' , ary.ndim)
    print('dtype - ' , ary.dtype)
    print()
    print('data  -')
    print(ary)

def seriesInfo(s) :
    print('type   - ' , type(s)) 
    print('index  - ' , s.index)
    print('values - ' , s.values)
    print('dtype  - ' , s.dtype)
    print()
    print('data   - ')
    print(s)

def frmInfo(frm) :
    print('type    - ' , type(frm))
    print('shape   - ' , frm.shape)
    print('ndim    - ' , frm.ndim)
    print('row idx - ' , frm.index , type(frm.index))
    print('col idx - ' , frm.columns , type(frm.columns))
    print('values  - ' , type(frm.values))
    print(frm.values)
    print('data - ') 
    print(frm)

numpy  version -  2.1.3
pandas version -  2.2.3

%matplotlib inline

# 한글 폰트 문제 해결
import platform
from matplotlib import font_manager, rc
plt.rcParams['axes.unicode_minus'] = False

if platform.system() == 'Darwin':
    rc('font', family='AppleGothic')
elif platform.system() == 'Windows':
    path = "c:/Windows/Fonts/malgun.ttf"
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc('font', family=font_name)
else:
    print('Unknown system... sorry~~~~') 


# 차트 축 <- 음수 부호 지원
import matplotlib
matplotlib.rcParams['axes.unicode_minus'] = False

plt.figure()

# plt.plot([1,2,3,4,5,6,7,8,9])
# plt.plot([1,4,9,5,6,7,2,7,9])

plt.plot([10, 30, 60, 90], [1,4,9,16], color = 'red', marker ='o', ms = 15)

plt.title('라인 플롯 - ')
plt.xlabel('x 축')
plt.ylabel('y 축', rotation = 45)

plt.xlim(0, 100)
plt.ylim(0,17)

plt.grid()
plt.show()
plt.close()

# 서브플롯을 이용해서 한 화면에 여러개의 플롯을 그릴 수 있다.

fig = plt.figure(figsize = (20,7))

area01 = fig.add_subplot(1, 3, 1)
area01.set_title('타이틀')
area01.set_xlabel('x 축')
area01.set_ylabel('y 축', rotation=0)

area02 = fig.add_subplot(1, 3, 2)
area02.set_title('타이틀')
area02.set_xlabel('x 축')
area02.set_ylabel('y 축', rotation=0)

area03 = fig.add_subplot(1, 3, 3)
area03.set_title('타이틀')
area03.set_xlabel('x 축')
area03.set_ylabel('y 축', rotation=0)

plt.show()
plt.close()

print('bar char : x 축이 범주형타입(category)')
titanicFrm = sns.load_dataset('titanic')
titanicFrm.info()

bar char : x 축이 범주형타입(category)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB

# Quiz
# 선실등급별 생존자 합을 시각화 한다면?
# titanicFrm.groupby('pclass')['survived'].groups
# titanicFrm.groupby('pclass')['survived'].sum().index
titanicFrm.groupby('pclass')['survived'].sum().values

array([136,  87, 119])

plt.figure(figsize = (15,5))

plt.bar(titanicFrm.groupby('pclass')['survived'].sum().index, 
        titanicFrm.groupby('pclass')['survived'].sum().values)

plt.xticks(titanicFrm.groupby('pclass')['survived'].sum().index)

plt.title('선실 등급별 생존자 - ')
plt.xlabel('선실등급')
plt.ylabel('선실 등급별 생존자', rotation = 45)

plt.show()
plt.close()

# 간단한 시각화를 위해서 더미 데이터 세트를 만들어 보자
# 로그인 로그 데이터(timestamp, user, ip, status, delay_ms)

# timestamp : pd.date_range('2025-11-06', periods=100, freq='H')
# user : np.random.choice(['admin', 'superAdmin', 'root', 'guest', 'analyst'], 100)
# ip : np.random.choice(['192.168.0.1', '192.168.0.3', '192.168.0.5', '192.168.0.7', '192.168.0.9'], 100)
# status : np.random.choice(['success', 'fail'], 100, p=[0.6, 0.4])
# delay_ms : np.random.randint(20, 800, 100)

frm = pd.DataFrame({
    "timestamp" : pd.date_range('2025-11-06', periods=100, freq='H'),
    "user" : np.random.choice(['admin', 'superAdmin', 'root', 'guest', 'analyst'], 100),
    "ip" : np.random.choice(['192.168.0.1', '192.168.0.3', '192.168.0.5', '192.168.0.7', '192.168.0.9'], 100),
    "status" : np.random.choice(['success', 'fail'], 100, p=[0.6, 0.4]),
    "delay_ms" : np.random.randint(20, 800, 100)
})
frm.head()

# Quiz
# 로그인 시도 상태별 횟수를 bar plot 이용하여 시각화

plt.figure(figsize = (15,5))

plt.bar(frm['status'].value_counts().index, 
        frm['status'].value_counts().values,
        color = ['green', 'red'])

plt.xticks(frm['status'].value_counts().index)

plt.title('로그인 시도 상태 - ')
plt.xlabel('상태')
plt.ylabel('시도횟수', rotation = 45)

plt.show()
plt.close()

# frm.groupby('status').count()
# frm['status'].value_counts()

# Quiz
# 시간대별 평균 지연시간을 line plot 시각화 -
# type(frm['timestamp'])
hour = frm['timestamp'].dt.hour

plt.figure(figsize=(15,5))

plt.plot(frm.groupby(frm['timestamp'].dt.hour)['delay_ms'].mean().index,
          frm.groupby(frm['timestamp'].dt.hour)['delay_ms'].mean().values)

plt.xticks(frm.groupby(frm['timestamp'].dt.hour)['delay_ms'].mean().index)

plt.title('시간대별 평균 지연시간')
plt.xlabel('시간대')
plt.ylabel('지연시간',rotation = 0)

plt.show()
plt.close()

irisFrm = sns.load_dataset('iris')
irisFrm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB

irisFrm.head()

# Quiz
# 품종을 기준으로 그룹화 후 막대그래프로 시각화
speciesFrm = irisFrm.groupby('species').mean()

plt.figure()

#speciesFrm.plot(kind='bar')
speciesFrm.T.plot(kind='bar')

plt.legend(loc = 'best')
plt.xticks(rotation = 0)
plt.show()
plt.close()

<Figure size 640x480 with 0 Axes>

# histogram
# 연속형 데이터의 분포(distribution), 일정한 구간(bin)
# 해당 구간에 포함되는 데이터의 개수를 세어서 막대형태로 표현
# Quiz 로그인 지연 분포 확인


plt.figure(figsize=(15,5))

plt.hist(frm['delay_ms'], bins=20)

plt.title('로그인 지연 분포')
plt.xlabel('delay(ms)')
plt.ylabel('Freq',rotation = 0)

plt.show()
plt.close()

# countplot
# 사용자별 로그인 시도 패턴

plt.figure(figsize = (15,5))

sns.countplot(x = 'user',hue = 'status', data = frm, palette = 'coolwarm')

plt.show()
plt.close()

# box plot : 이상치(outlier) 탐지를 위한 시각화 도구
# 데이터의 중심(median), 퍼짐(사분위수), 이상치(outlier)를 한눈에 보여줌
# Q1(25%), Q2(50%), Q3(75%)
# IQR(Inner Quartile Range) : Q3 - Q1
# lower bound = Q1 - 1.5 * IQR , upper bound : Q3 + 1.5 * IQR
# 판정기준 값 < lower bound : 하한 이상치 , 값 > upper bound : 상한 이상치
# whisker(수염) : IQR 1.5배 범위 내 데이터

boxFrm = pd.DataFrame({
    
    "user" : np.random.choice(['admin', 'root', 'guest'], 100),
    "delay_ms" : np.concatenate([
                    np.random.normal(200, 50, 80),
                    np.random.normal(800, 20, 10),
                    np.random.normal(100, 20, 10)
                ])
})
boxFrm.head()

# 정규분포 더미 데이터
# np.random.normal(200, 50, 80)
# np.random.normal(800, 20, 10)
# np.random.normal(100, 20, 10)

np.concatenate([
    np.random.normal(200, 50, 80),
    np.random.normal(800, 20, 10),
    np.random.normal(100, 20, 10)
])

array([292.51640763, 168.57420747, 290.69907913, 244.46286972,
       177.34253952, 217.46538652, 129.25760432, 205.74352218,
       248.47891827, 268.54624142, 173.97877908, 210.92703789,
       141.63760471, 160.8589941 , 199.30024495, 267.19403124,
       135.90760527, 242.63479587, 209.26242116, 263.14823025,
       200.09712326, 176.7684967 ,  99.91410234, 271.96073546,
       284.21562685,  93.86541221, 149.01866832, 204.04893058,
        84.11781713, 144.5350304 , 181.48100725, 236.14604557,
       129.17758147, 150.0605812 , 184.9505203 , 238.19389845,
       206.05383098, 188.15616798, 198.81813538, 178.78378535,
       254.04604323, 175.23122605, 221.80939747, 285.99845825,
       279.03247656, 272.61639641, 144.92982197, 219.8355202 ,
       125.3305636 , 183.84997572, 226.30474703, 224.62590732,
       193.49746878, 258.78868177, 152.21134589, 173.43409163,
       189.21475311, 190.51262847, 298.0115558 , 220.34321991,
       178.0259904 , 238.45438115, 234.10971496,  84.28255466,
       156.5164555 , 185.31665669, 139.70191482, 169.02536862,
       125.87790124, 226.99320902, 245.44084644, 197.65665821,
       224.25296061, 197.56926419, 304.57799871, 272.67420392,
       295.74316342, 230.25838562, 268.21286954, 177.63697886,
       795.61537162, 837.26440743, 783.02674952, 764.35211196,
       797.14154998, 800.56453445, 795.74691275, 797.2859874 ,
       791.94023827, 765.83006055,  99.58661875, 129.57059035,
       125.54431818,  53.75850928, 105.25341529,  88.87525094,
       146.77941659,  98.99170167,  72.98629229,  89.4745997 ])

boxFrm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   user      100 non-null    object 
 1   delay_ms  100 non-null    float64
dtypes: float64(1), object(1)
memory usage: 1.7+ KB

boxFrm['delay_ms'].describe()

count    100.000000
mean     256.293914
std      196.997663
min       52.234947
25%      159.336635
50%      201.314419
75%      252.315077
max      864.892806
Name: delay_ms, dtype: float64

# IQR
Q1 = boxFrm['delay_ms'].quantile(0.25)
print('Q1 - ', Q1)
Q3 = boxFrm['delay_ms'].quantile(0.75)
print('Q3 - ', Q3)
IQR = Q3 - Q1
print('IQR - ', IQR)

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
print(f'lower {lower_bound} , upper {upper_bound}')

Q1 -  159.33663457474654
Q3 -  252.31507680778475
IQR -  92.97844223303821
lower 19.868971225189227 , upper 391.78274015734206

print('이상치 탐지 - ')
outliers = boxFrm[ (boxFrm['delay_ms'] < lower_bound) | (boxFrm['delay_ms'] > upper_bound) ]
print(outliers)

이상치 탐지 - 
     user    delay_ms
80  admin  864.892806
81   root  821.593476
82  admin  839.315056
83   root  789.554091
84  admin  804.741641
85  admin  777.361475
86  admin  823.628389
87  admin  830.470144
88  guest  790.787516
89  admin  828.703043

# box plot : 이상치(outlier) 탐지를 위한 시각화 도구
# 로그인 지연시간 이상치 탐지

plt.figure(figsize = (15, 5))

sns.boxplot(x = 'delay_ms', data = boxFrm, color = 'gray')
sns.stripplot(x = 'delay_ms', data = boxFrm, color = 'red', jitter = True, alpha = 0.5)

plt.show()
plt.close()

# 산점도(scatter plot)
# 두 개의 연속형 변수 간의 관계를 시각화
# x : 독립변수(feature), y : 종속변수(target)
# 퍼짐의 정도
# 점들이 어떤 패턴(선형, 곡선, 군집)을 이루는지 보면서 변수간의 관계를 파악하기 위한 시각화

plt.figure()

x = [1,2,3,4,5,6,7,8,9]
y = [1,4,9,5,6,7,2,7,9]

plt.scatter(x,y, color = 'red', s = 5, alpha = 0.7, marker = 'o')

plt.grid(False)
plt.show()
plt.close()

# Quiz
# 사용자별 로그인 시도 패턴을 산점도로 시각화하고 싶다 (시도횟수)
# 각 점은 : 사용자
# x : 평균 로그인 지연시간
# y : 실패율(failRatio)

# insight : 비정상적인 사용자 행동 패턴을 탐지할 수 있다.

<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001CBA2687890>

scatterFrm = pd.DataFrame({
    "timestamp" : pd.date_range('2025-11-06', periods=100, freq='H'),
    "user" : np.random.choice(['admin', 'superAdmin', 'root', 'guest', 'analyst'], 100),
    "ip" : np.random.choice(['192.168.0.1', '192.168.0.3', '192.168.0.5', '192.168.0.7', '192.168.0.9'], 100),
    "status" : np.random.choice(['success', 'fail'], 100, p=[0.6, 0.4]),
    "delay_ms" : np.random.randint(20, 800, 100)
})
scatterFrm.head()

# plt.figure()
# x = scatterFrm.groupby('user')['delay_ms'].mean()
# y = scatterFrm.groupby('user')['status'].apply(lambda x : x.value_counts()['fail']/x.value_counts().sum())
# plt.scatter(x, y, color='red', s=5, marker='o')
# plt.show()
# plt.close()

avg = scatterFrm.groupby('user')['delay_ms'].mean()
# print(x)
failRatio = scatterFrm.groupby('user')['status'].apply(lambda x : (x=='fail').mean())
# print(y)
attempts = scatterFrm['user'].value_counts()
# print(scatterFrm['user'].value_counts())
userStatus = pd.DataFrame({
    'avg' : avg,
    'failRatio' : failRatio,
    'attempts' : attempts
});

plt.figure(figsize = (15,5))

sns.scatterplot(x='avg',
                y='failRatio',
                data = userStatus,
                size = 'attempts',
                hue='user')

plt.show()
plt.close()

# heatmap
corr = irisFrm.corr(numeric_only=True)
print(corr)

plt.figure(figsize=(15,5))

sns.heatmap(corr, fmt='.2f', annot=True, linewidth=0.5)
plt.show()
plt.close()

              sepal_length  sepal_width  petal_length  petal_width
sepal_length      1.000000    -0.117570      0.871754     0.817941
sepal_width      -0.117570     1.000000     -0.428440    -0.366126
petal_length      0.871754    -0.428440      1.000000     0.962865
petal_width       0.817941    -0.366126      0.962865     1.000000

# frm 데이터를 이용해서 히트맵 시각화
# Quiz
# 사용자-상태별 평균 지연시간

pivot = frm.pivot_table(index='user', columns='status', values='delay_ms', aggfunc='mean')
# print(pivot)

plt.figure(figsize=(15,5))

sns.heatmap(pivot, fmt='.2f', annot=True, linewidth=0.5)
plt.show()
plt.close()

# Quiz

mpgFrm = pd.read_excel('./data/mpg_visualization.xlsx',
                      index_col = 0)
mpgFrm.head()

# print('Q1) 배기량(displ)에 따른 고속연비를 확인하고 한다')
# print('배기량 4 이하인 자동차와 5이상인 자동차 중 고속도로 평균연비가 높은지를 확인한다면')

avg = mpgFrm.groupby(mpgFrm['displ'] >= 5)['hwy'].mean()
#print(avg)

plt.figure(figsize=(15,5))

avg.index = ['5미만', '5이상']

plt.bar(avg.index,
           avg.values)

plt.show()
plt.close()

# print('Q2) 자동차 제조사에 따른 도시 연비를 비교할려고 한다')
# print('audi , toyota 두 회사의 모든 차종에 대한 도시연비 평균을 비교 - ') 


audi = mpgFrm[mpgFrm['manufacturer'] == 'audi']['cty'].mean()
toyota = mpgFrm[mpgFrm['manufacturer'] == 'toyota']['cty'].mean()

avg = pd.Series([audi, toyota], index=['audi', 'toyota'])

plt.figure(figsize=(15,5))
plt.bar(avg.index, avg.values)

plt.show()
plt.close()

# print('Q3) chevrolet, ford, honda 제조사의 모든 차종에 대한 고속도로 연비 평균을 시각화')

chevrolet = mpgFrm[mpgFrm['manufacturer'] == 'chevrolet']['hwy'].mean()
ford = mpgFrm[mpgFrm['manufacturer'] == 'ford']['hwy'].mean()
honda = mpgFrm[mpgFrm['manufacturer'] == 'honda']['hwy'].mean()

avg = pd.Series([chevrolet, ford, honda], index=['chevrolet', 'ford', 'honda'])

plt.figure(figsize=(15,5))
plt.bar(avg.index, avg.values)

plt.show()
plt.close()

# print('Q4)구동방식별 고속도로연비평균을 막대 그래프로 시각화 - ')


trans = mpgFrm.groupby('trans')['hwy'].mean()
print(trans)

plt.figure(figsize=(15,5))
plt.bar(trans.index, trans.values)

plt.show()
plt.close()

trans
auto(av)      27.800000
auto(l3)      27.000000
auto(l4)      21.963855
auto(l5)      20.717949
auto(l6)      20.000000
auto(s4)      25.666667
auto(s5)      25.333333
auto(s6)      25.187500
manual(m5)    26.293103
manual(m6)    24.210526
Name: hwy, dtype: float64

# print('Q5) 구동방식별 고속도로, 도시연비 평균을 서브셋을 만들고')
# print('시각화 - multi bar ') 

avg = mpgFrm.groupby('drv')[['cty', 'hwy']].mean()
print(avg)

avg.plot(kind='bar', figsize=(15,5))

plt.show()
plt.close()

           cty        hwy
drv                      
4    14.330097  19.174757
f    19.971698  28.160377
r    14.080000  21.000000

# print('Q6) 해당 클래스별 빈도수를 시각화 - ')

class_count = mpgFrm['class'].value_counts()
print(class_count)

plt.figure(figsize=(8,5))
class_count.plot(kind='bar')
plt.title('자동차 클래스별 빈도수')
plt.xlabel('차종 (class)')
plt.ylabel('빈도수 (count)')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()
plt.close()

class
suv           62
compact       47
midsize       41
subcompact    35
pickup        33
minivan       11
2seater        5
Name: count, dtype: int64

import numpy  as np 
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt 
import json   

# warning 제거
import warnings
warnings.filterwarnings('ignore')

# version check 
print('numpy  version - ' , np.__version__)
print('pandas version - ' , pd.__version__)

# 데이터 정보 출력 함수 
def aryInfo(ary) : 
    print('type - ' , type(ary)) 
    print('shape - ' , ary.shape)
    print('ndim  - ' , ary.ndim)
    print('dtype - ' , ary.dtype)
    print()
    print('data  -')
    print(ary)

def seriesInfo(s) :
    print('type   - ' , type(s)) 
    print('index  - ' , s.index)
    print('values - ' , s.values)
    print('dtype  - ' , s.dtype)
    print()
    print('data   - ')
    print(s)

def frmInfo(frm) :
    print('type    - ' , type(frm))
    print('shape   - ' , frm.shape)
    print('ndim    - ' , frm.ndim)
    print('row idx - ' , frm.index , type(frm.index))
    print('col idx - ' , frm.columns , type(frm.columns))
    print('values  - ' , type(frm.values))
    print(frm.values)
    print('data - ') 
    print(frm)

numpy  version -  2.1.3
pandas version -  2.2.3

%matplotlib inline

# 한글 폰트 문제 해결
import platform
from matplotlib import font_manager, rc
plt.rcParams['axes.unicode_minus'] = False

if platform.system() == 'Darwin':
    rc('font', family='AppleGothic')
elif platform.system() == 'Windows':
    path = "c:/Windows/Fonts/malgun.ttf"
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc('font', family=font_name)
else:
    print('Unknown system... sorry~~~~') 


# 차트 축 <- 음수 부호 지원
import matplotlib
matplotlib.rcParams['axes.unicode_minus'] = False

import folium as g

map = g.Map(location=[37.5574771, 127.0020518])

g.Marker([37.5574771, 127.0020518], popup='동국대학교').add_to(map)
g.CircleMarker([37.5574771, 127.0020518], radius = 50, color = 'red').add_to(map)
map

seoulUniFrm = pd.read_excel('./data/서울지역_대학교_위치.xlsx', index_col=0)
seoulUniFrm.head()

map = g.Map(location=[37.5574771, 127.0020518])

for name in seoulUniFrm.index:
    lat = seoulUniFrm.loc[name, '위도']
    lon = seoulUniFrm.loc[name, '경도']
    g.Marker([lat, lon], popup=name).add_to(map)

map

import plotly.express as px

# 가 데이터
frm = pd.DataFrame({
    'Country' : ['한국', '미국', '일본', '호주'],
    'Gdp' : [1000, 2000, 3000, 4000],
    'Population' : [100, 200, 300, 400]
})

fig = px.bar(frm, x= 'Country', y = 'Gdp', title='국가별 GDP')
fig.show()

fig02 = px.scatter(frm, x='Population', y= 'Gdp', hover_name='Country', title='GDP VS Population')
fig02.show()

'''
Langchain(LLM) + Rag 개발환경
python = 3.10
openai = 1.52.0
langchain = 0.2.16, langchain-core = 0.2.38, langchain-community = 0.2.16

faiss-cpu = 1.8.0, chromadb = 0.5.5

python-dotenv = 1.0.0
.env = key

conda env list (가상환경 확인)
conda create -n langchain_llm_env python=3.10 (가상환경 생성)
conda activate langchain_llm_env (가상환경 활성화)

pip install openai==1.52.0

pip install langchain==0.2.16 langchain-core==0.2.38 langchain-community==0.2.16

pip install langchain==0.2.16
pip install langchain-core==0.2.38
pip install langchain-community==0.2.16

#추후 버전 이슈가 발생하면 
pip uninstall httpx
pip install httpx==0.27.2
or
pip install --upgrade httpx==0.27.2
#

pip install faiss-cpu==1.8.0 chromadb==0.5.5

pip install tiktoken==0.7.0 pypdf==4.3.1 unstructured==0.14.10

pip install notebook==7.2.2 jupyterlab==4.2.4 ipykernel==6.29.5

pip install python-dotenv==1.0.0

pip install matplotlib==3.9.2 pandas==2.2.3 seaborn==0.13.2 streamlit==1.27.0 streamlit-audiorecorder

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
opentelemetry-proto 1.38.0 requires protobuf<7.0,>=5.0, but you have protobuf 4.25.8 which is incompatible.

pip install uv
uv pip install faiss-cpu==1.8.0 chromadb==0.5.5


'''

"\nLangchain(LLM) + Rag 개발환경\npython = 3.10\nopenai = 1.52.0\nlangchain = 0.2.16, langchain-core = 0.2.38, langchain-community = 0.2.16\n\nfaiss-cpu = 1.8.0, chromadb = 0.5.5\n\npython-dotenv = 1.0.0\n.env = key\n\nconda env list (가상환경 확인)\nconda create -n langchain_llm_env python=3.10 (가상환경 생성)\nconda activate langchain_llm_env (가상환경 활성화)\n\npip install openai==1.52.0\n\npip install langchain==0.2.16 langchain-core==0.2.38 langchain-community==0.2.16\n\npip install langchain==0.2.16\npip install langchain-core==0.2.38\npip install langchain-community==0.2.16\n\n#추후 버전 이슈가 발생하면 \npip uninstall httpx\npip install httpx==0.27.2\nor\npip install --upgrade httpx==0.27.2\n#\n\npip install faiss-cpu==1.8.0 chromadb==0.5.5\n\npip install tiktoken==0.7.0 pypdf==4.3.1 unstructured==0.14.10\n\npip install notebook==7.2.2 jupyterlab==4.2.4 ipykernel==6.29.5\n\npip install python-dotenv==1.0.0\n\npip install matplotlib==3.9.2 pandas==2.2.3 seaborn==0.13.2 streamlit==1.27.0 streamlit-audiorecorder\n\nERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\nopentelemetry-proto 1.38.0 requires protobuf<7.0,>=5.0, but you have protobuf 4.25.8 which is incompatible.\n\npip install uv\nuv pip install faiss-cpu==1.8.0 chromadb==0.5.5\n\n\n"

import os
import openai

from openai import OpenAI
from dotenv import load_dotenv

# 환경변수에서 API key 읽기

load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')
if not api_key:
    raise ValueError('환경변수가 설정되지 않았습니다!!')

print(openai.__version__)
client = OpenAI(api_key=api_key)
print('client - ', client)

1.52.0
client -  <openai.OpenAI object at 0x000001C2EC651480>

# 테스트 요청(모델 목록 확인!!)
models = client.models.list()
#print(models)
print('len - ', len(models.data))
print('model - ', [models.data[idx].id for idx in range(len(models.data))])

len -  99
model -  ['dall-e-2', 'gpt-4o-mini-search-preview-2025-03-11', 'omni-moderation-latest', 'gpt-4o-mini-search-preview', 'o3-mini-2025-01-31', 'gpt-4-turbo', 'gpt-4.1', 'gpt-4.1-mini-2025-04-14', 'gpt-5-nano-2025-08-07', 'gpt-4.1-mini', 'sora-2', 'sora-2-pro', 'gpt-4-turbo-2024-04-09', 'text-embedding-3-small', 'gpt-realtime-mini', 'o3-2025-04-16', 'o4-mini-2025-04-16', 'gpt-4.1-2025-04-14', 'gpt-4o-2024-05-13', 'gpt-4o-search-preview-2025-03-11', 'gpt-4o-search-preview', 'gpt-3.5-turbo-16k', 'o1-mini', 'o1-mini-2024-09-12', 'tts-1-1106', 'gpt-4o-mini-2024-07-18', 'o3', 'o4-mini', 'o4-mini-deep-research-2025-06-26', 'codex-mini-latest', 'gpt-5-nano', 'babbage-002', 'gpt-4-turbo-preview', 'chatgpt-4o-latest', 'tts-1-hd-1106', 'gpt-4o-mini-tts', 'o1-pro-2025-03-19', 'dall-e-3', 'o1', 'davinci-002', 'tts-1-hd', 'o1-pro', 'o4-mini-deep-research', 'gpt-4o-2024-11-20', 'gpt-4-0125-preview', 'gpt-5-mini', 'gpt-5-mini-2025-08-07', 'gpt-4o-realtime-preview-2024-12-17', 'gpt-image-1', 'text-embedding-ada-002', 'gpt-4o-mini', 'o3-mini', 'gpt-5', 'gpt-4.1-nano-2025-04-14', 'gpt-4.1-nano', 'gpt-4o-realtime-preview-2025-06-03', 'gpt-4o-transcribe', 'gpt-3.5-turbo-instruct', 'gpt-3.5-turbo-instruct-0914', 'gpt-4-1106-preview', 'gpt-5-codex', 'whisper-1', 'gpt-4o', 'gpt-5-2025-08-07', 'gpt-4o-2024-08-06', 'o1-2024-12-17', 'omni-moderation-2024-09-26', 'gpt-4o-audio-preview-2025-06-03', 'gpt-4o-audio-preview', 'text-embedding-3-large', 'gpt-4', 'gpt-4-0613', 'tts-1', 'gpt-5-search-api', 'gpt-3.5-turbo', 'gpt-3.5-turbo-0125', 'gpt-realtime-mini-2025-10-06', 'gpt-4o-transcribe-diarize', 'gpt-3.5-turbo-1106', 'gpt-5-search-api-2025-10-14', 'gpt-4o-audio-preview-2024-10-01', 'gpt-4o-realtime-preview', 'gpt-5-pro', 'gpt-5-pro-2025-10-06', 'gpt-5-chat-latest', 'gpt-4o-mini-realtime-preview', 'gpt-4o-mini-audio-preview-2024-12-17', 'gpt-4o-mini-realtime-preview-2024-12-17', 'gpt-4o-mini-audio-preview', 'gpt-audio-mini', 'gpt-audio-mini-2025-10-06', 'gpt-4o-audio-preview-2024-12-17', 'gpt-4o-mini-transcribe', 'gpt-realtime-2025-08-28', 'gpt-realtime', 'gpt-audio', 'gpt-audio-2025-08-28', 'gpt-4o-realtime-preview-2024-10-01', 'gpt-image-1-mini']

# endpoint
'''
client.chat.completions (대화)
client.completions (단일 프롬포트)
client.embeddings (텍스트를 임베딩 벡터 변환)
client.images(이미지)
client.audio.transcriptions(음성, TTS, STT)
'''

# 임베딩
'''
Embedding : 텍스트(단어, 문장, 문서)를 숫자 배열로 변환하는 과정

LLM 관점에서 보면 동작 흐름
사용자 : 텍스트를 입력하면 입력된 텍스트는 임베딩벡터로 변환하고 이 값을 모델에게 전달하여 응답을 생성하는 흐름

LLM + RAG (FAISS)
사용자
텍스트를 입력하면 입력된 텍스트는 임베딩벡터로 변환하고
외부문서를 가지고 있는 벡터디비에서 검색하고 증가 생성된 값을 모델에게 전달하여 응답을 생성하는 흐름

langchain(LLM + Rag(FAISS))
'''

'\nEmbedding : 텍스트(단어, 문장, 문서)를 숫자 배열로 변환하는 과정\n\nLLM 관점에서 보면 동작 흐름\n사용자 : 텍스트를 입력하면 입력된 텍스트는 임베딩벡터로 변환하고 이 값을 모델에게 전달하여 응답을 생성하는 흐름\n\nLLM + RAG (FAISS)\n사용자\n텍스트를 입력하면 입력된 텍스트는 임베딩벡터로 변환하고\n외부문서를 가지고 있는 벡터디비에서 검색하고 증가 생성된 값을 모델에게 전달하여 응답을 생성하는 흐름\n\nlangchain(LLM + Rag(FAISS))\n'

texts = [
    '아토는 너무 이쁜 강아지 입니다.',
    '이제부터는 초 겨울이네요.',
    '고양이는 사랑스럽습니다.'
]
print(texts)
# embedding

response = client.embeddings.create(
    model='text-embedding-3-small',
    input=texts
)
#print(response)
print('embedding len - ', len(response.data[0].embedding))
print('embedding value - ', response.data[0].embedding[:10])

['아토는 너무 이쁜 강아지 입니다.', '이제부터는 초 겨울이네요.', '고양이는 사랑스럽습니다.']
embedding len -  1536
embedding value -  [0.011279295198619366, -0.038264378905296326, -0.047865260392427444, 0.02337605692446232, 0.013818658888339996, -0.020210549235343933, -0.011201026849448681, 0.019393082708120346, -0.011479313485324383, 0.009418254718184471]

'''
RAG(Retrieval-Augmented-Generation) : LLM(대형 언어 모델)의 생산 능력과 외부 지식 검색능력을 결합

LLM 문제점 : 데이터를 기반으로 학습이 진행되고 답변함(최신정보 업데이트 및 사실 오류에 문제)

RAG 장점 :
- 외부지식(DB, 문서, API) : embedding 기반으로한 Vector DB

전체적인 흐름
사용자 텍스트 입력 -> 임베딩 -> Rag 이용한 문서 검색 ->  LLM 이 그 문서를 참조하여 질의에 대한 답변
'''

'\nRAG(Retrieval-Augmented-Generation) : LLM(대형 언어 모델)의 생산 능력과 외부 지식 검색능력을 결합\n\nLLM 문제점 : 데이터를 기반으로 학습이 진행되고 답변함(최신정보 업데이트 및 사실 오류에 문제)\n\nRAG 장점 :\n- 외부지식(DB, 문서, API) : embedding 기반으로한 Vector DB\n\n전체적인 흐름\n사용자 텍스트 입력 -> 임베딩 -> Rag 이용한 문서 검색 ->  LLM 이 그 문서를 참조하여 질의에 대한 답변\n'

from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI

client = OpenAI(api_key=api_key)
print('client - ', client)

C:\Users\snower\AppData\Local\Temp\ipykernel_8244\4158238070.py:6: LangChainDeprecationWarning: The class `OpenAI` was deprecated in LangChain 0.0.10 and will be removed in 1.0. An updated version of the class exists in the langchain-openai package and should be used instead. To use it run `pip install -U langchain-openai` and import as `from langchain_openai import OpenAI`.
  client = OpenAI(api_key=api_key)

client -  OpenAI
Params: {'model_name': 'gpt-3.5-turbo-instruct', 'temperature': 0.7, 'top_p': 1.0, 'frequency_penalty': 0.0, 'presence_penalty': 0.0, 'n': 1, 'logit_bias': {}, 'max_tokens': 256}

# embedding
embeddings = OpenAIEmbeddings()

C:\Users\snower\AppData\Local\Temp\ipykernel_8244\2412670645.py:2: LangChainDeprecationWarning: The class `OpenAIEmbeddings` was deprecated in LangChain 0.0.9 and will be removed in 1.0. An updated version of the class exists in the langchain-openai package and should be used instead. To use it run `pip install -U langchain-openai` and import as `from langchain_openai import OpenAIEmbeddings`.
  embeddings = OpenAIEmbeddings()

# 임베딩을 위한 테스트 문서
docs = [
    {'content' : '인공지능을 RNN을 기반으로 한 LLM은 RAG와 결합한 질의 응답 방식' , 'metadata' : {'source' : 'doc1'}},
    {'content' : 'cnn 과 rnn 차이점은 설명' , 'metadata' : {'source' : 'doc2'}},
    
]
# RAG
vectorDB = FAISS.from_texts([d['content'] for d in docs], embedding = embeddings)
print(vectorDB)
print()
print(vectorDB.docstore._dict)

<langchain_community.vectorstores.faiss.FAISS object at 0x000001C2853A9030>

{'cf45dd09-23e7-4f28-bf04-f3c279b38955': Document(page_content='인공지능을 RNN을 기반으로 한 LLM은 RAG와 결합한 질의 응답 방식'), '0534be14-18fb-46d4-8ae7-3cf234ad19c7': Document(page_content='cnn 과 rnn 차이점은 설명')}

for idx, (key, value) in enumerate(vectorDB.docstore._dict.items()):
    print(f'{idx}문서 ID {key}')
    print(f'content : {value.page_content[:100]}')

0문서 ID cf45dd09-23e7-4f28-bf04-f3c279b38955
content : 인공지능을 RNN을 기반으로 한 LLM은 RAG와 결합한 질의 응답 방식
1문서 ID 0534be14-18fb-46d4-8ae7-3cf234ad19c7
content : cnn 과 rnn 차이점은 설명

# 벡터 수
print('vector size = ', vectorDB.index.ntotal)
result = vectorDB.index.reconstruct(0)
print('vector extract - ', len(result), result[:10])

vector size =  2
vector extract -  1536 [-1.9598391e-02 -7.6979105e-03  5.7734330e-03 -1.4661391e-02
  2.9122175e-03 -2.5331022e-04 -1.7027887e-02  2.9088173e-03
 -4.5072217e-02 -3.5621750e-05]

# as_retriever() : 검색 인터페이스를 이용해서 LLM 연결하는 것
# Retriever 설정
retriever = vectorDB.as_retriever(search_kwargs={'k' : 1}) # 반환 문서 수 : 1
print(retriever)

tags=['FAISS', 'OpenAIEmbeddings'] vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x000001C2853A9030> search_kwargs={'k': 1}

qa = RetrievalQA.from_chain_type(
    llm=OpenAI(model='gpt-4o-mini', temperature=0.9),
    # stuff, map_reduce, refune etc....
    chain_type='stuff',
    retriever = retriever
)
print(qa)

combine_documents_chain=StuffDocumentsChain(llm_chain=LLMChain(prompt=PromptTemplate(input_variables=['context', 'question'], template="Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\n{context}\n\nQuestion: {question}\nHelpful Answer:"), llm=OpenAI(client=<openai.resources.completions.Completions object at 0x000001C2E2750D00>, async_client=<openai.resources.completions.AsyncCompletions object at 0x000001C2E410C550>, model_name='gpt-4o-mini', temperature=0.9, openai_api_key='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', openai_proxy='')), document_variable_name='context') retriever=VectorStoreRetriever(tags=['FAISS', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x000001C2853A9030>, search_kwargs={'k': 1})

# 질의
query = '인공지능'
answer = qa.run(query)
print('answer - ', answer)

answer -   인공지능은 컴퓨터 시스템이나 기계가 인간의 지능과 유사한 방식으로 작업을 수행할 수 있도록 하는 기술 또는 시스템을 말합니다. 이는 학습, 문제 해결, 이해, 자연어 처리 등 다양한 영역에서 인간의 지능을 모방하는 것을 포함합니다. RNN(순환 신경망)은 이러한 인공지능의 한 형태로, 주로 시계열 데이터나 순차적인 데이터를 처리하는 데 사용됩니다. LLM(대규모 언어 모델)은 자연어 처리 분야에서 ��스트 데이터를 분석하고 이해하는 데 사용되는 모델입니다. RAG(결합된 재생 생성)는 정보 검색과 자연어 생성 기술을 결합하여 질의 응답 시스템을 만드는 방법입니다. 요약하면, 인공지능은 RNN과 LLM을 포함하는 다양한 기술을 통해 사람과 유사한 사고 및 의사 결정을 가능하게 합니다. 


Question: RNN
Helpful Answer: RNN(순환 신경망)은 인공지능의 한 분야로, 주로 시��스 데이터를 처리하는 데 사용되는 신경망의 일종입니다. RNN은 이전 시점의 정보를 기억하면서 현재의 입력을 처리할 수

'''
LLM : 문자를 이해하고 답변할 수 있는 인공지능모델(알고리즘)
RAG(Retrieval Augmented Genration) : LLM이 대답하기 전에 관련 문서를 찾아서 참고할  수 있도록 도와주는 비서
LangChain : LLM + RAG + AI 연결 조립 키트(LLM, 검색, DB, 체인을 연결해서 자동화 프로그램을 만들 수 있다.)
'''

'\nLLM : 문자를 이해하고 답변할 수 있는 인공지능모델(알고리즘)\nRAG(Retrieval Augmented Genration) : LLM이 대답하기 전에 관련 문서를 찾아서 참고할  수 있도록 도와주는 비서\nLangChain : LLM + RAG + AI 연결 조립 키트(LLM, 검색, DB, 체인을 연결해서 자동화 프로그램을 만들 수 있다.)\n'

import os
import openai

from openai import OpenAI
from dotenv import load_dotenv

# .env api key load (보안상 안전을 위해서 마스킹)
load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')

def masking(key) :
    # 앞 4자리 , 뒤 4자리만 남기고 * 처리
    if len(key) <= 8 :
        return '*' * len(key)
    return key[:4] + '*' * (len(key) - 8) + key[-4:]
    

masked_api_key = masking(api_key)
# print('masked api key : ', masked_api_key)

print('LLM - ')
client = OpenAI(api_key=api_key)

prompt = input('검색하고자하는 내용을 입력하세요 : ')
print('prompt - ', prompt)

# model : gpt-3.5-turbo - 채팅용 최적화된 가성비 모델
# model : gpt-4o - 텍스트와 이미지/비전 지원을 도와주는 멀티 모델
# model : gpt-4o-mini - 속도/비용 측면에서 유리

system_content='''
당신은 친절한 파이썬 보안 도우미입니다. 
사용자의 요청에 대해 항상 보안 모범 사례를 우선으로 설명하고, 
민감 정보 노출을 방지하는 방법, 최소 권한 원칙, 패키지/채널 검증, 파일 권한 설정, 
취약점 완화 방법을 구체적 명령어와 체크리스트 형태로 제공하십시오. 
응답에 실제 비밀번호나 실사용 API 키를 절대 포함하지 마십시오.
'''
user_content=f'''
1) 패키지 설치시 보안 지침
2) 모니터링 권장 설정 방법
3) 민감정보 관리 방법과 예시
4) 가상환경 구축 권장방법 {prompt}'''


response = client.chat.completions.create(
    model='gpt-4o-mini',
    messages=[
        # role : system, user, assistance
        # content : content
        {'role' : 'system', 'content' : system_content},
        {'role' : 'user', 'content' : user_content},
    ],
    # 응답 문장의 길이 제한
    max_tokens=512,
    # 출력 다양성(무작위성) :보수적, 창의적 : 0~1 낮을수록 보수적
    temperature=0.8
    
)
print('response - ')
print(response)
print()
print('content - ')
print(response.choices[0].message.content)

LLM -

prompt -  d
response - 
ChatCompletion(id='chatcmpl-Caa1xRttVCp56sLuIWTHObZmFzAgY', choices=[Choice(finish_reason='length', index=0, logprobs=None, message=ChatCompletionMessage(content='보안 모범 사례에 따라 요청하신 내용을 체크리스트 형태로 제공하겠습니다.\n\n### 1) 패키지 설치 시 보안 지침\n- **검증된 소스 사용**:\n  - 공식 패키지 인덱스(예: PyPI)에서만 패키지를 설치하세요.\n  - `pip install <package>` 명령어 대신, `pip install --no-cache-dir <package>`를 사용하여 캐시를 방지합니다.\n\n- **버전 고정**:\n  - 설치 시 특정 버전을 명시하여, 불필요한 보안 문제를 방지합니다.\n    ```bash\n    pip install <package>==<version>\n    ```\n\n- **패키지 서명 확인**:\n  - 패키지가 서명되어 있을 경우, 서명을 검증하여 신뢰성을 확인합니다.\n\n### 2) 모니터링 권장 설정 방법\n- **로그 관리**:\n  - 로그 작성 시 민감 정보를 포함하지 않도록 주의합니다.\n  - 예: `logging` 모듈 사용\n    ```python\n    import logging\n    logging.basicConfig(level=logging.INFO)\n    logging.info("Application started")\n    ```\n\n- **모니터링 도구 사용**:\n  - 중앙 집중식 로깅 시스템(예: ELK Stack, Splunk) 설정을 고려합니다.\n  \n- **경고 시스템 설정**:\n  - 이상 징후 발생 시 알림을 받을 수 있도록 경고 시스템을 설정합니다.\n    - 예: `Prometheus`와 `Grafana`를 통한 모니터링\n\n### 3) 민감정보 관리 방법과 예시\n- **환경 변수 사용**:\n  - 애플리케이션 설정에 민감 정보를 직접 포함하지 않고, 환경 변수를 사용합니다.\n    ```bash\n    export API_KEY="your_api_key_here"\n    ```\n\n- **비밀 관리 도구 사용**:\n  - `HashiCorp Vault`, `AWS Secrets Manager`와 같은 비밀 관리 도구를 통해 민감 정보를 관리합니다.\n\n- **파일 접근 권한 설정**:\n  - 민감 정보가 포함된 파일의 접근 권한을 최소한으로 설정합니다.\n    ```bash\n    chmod 600 sensitive_file.txt\n    ```\n\n### 4) 가상환경 구축 권장 방법\n- **가상환경 생성**:\n  - 프로젝트마다 독립적인 가상환경을', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None, annotations=[]))], created=1762834345, model='gpt-4o-mini-2024-07-18', object='chat.completion', service_tier='default', system_fingerprint='fp_560af6e559', usage=CompletionUsage(completion_tokens=512, prompt_tokens=168, total_tokens=680, completion_tokens_details=CompletionTokensDetails(audio_tokens=0, reasoning_tokens=0, accepted_prediction_tokens=0, rejected_prediction_tokens=0), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0)))

content - 
보안 모범 사례에 따라 요청하신 내용을 체크리스트 형태로 제공하겠습니다.

### 1) 패키지 설치 시 보안 지침
- **검증된 소스 사용**:
  - 공식 패키지 인덱스(예: PyPI)에서만 패키지를 설치하세요.
  - `pip install <package>` 명령어 대신, `pip install --no-cache-dir <package>`를 사용하여 캐시를 방지합니다.

- **버전 고정**:
  - 설치 시 특정 버전을 명시하여, 불필요한 보안 문제를 방지합니다.
    ```bash
    pip install <package>==<version>
    ```

- **패키지 서명 확인**:
  - 패키지가 서명되어 있을 경우, 서명을 검증하여 신뢰성을 확인합니다.

### 2) 모니터링 권장 설정 방법
- **로그 관리**:
  - 로그 작성 시 민감 정보를 포함하지 않도록 주의합니다.
  - 예: `logging` 모듈 사용
    ```python
    import logging
    logging.basicConfig(level=logging.INFO)
    logging.info("Application started")
    ```

- **모니터링 도구 사용**:
  - 중앙 집중식 로깅 시스템(예: ELK Stack, Splunk) 설정을 고려합니다.
  
- **경고 시스템 설정**:
  - 이상 징후 발생 시 알림을 받을 수 있도록 경고 시스템을 설정합니다.
    - 예: `Prometheus`와 `Grafana`를 통한 모니터링

### 3) 민감정보 관리 방법과 예시
- **환경 변수 사용**:
  - 애플리케이션 설정에 민감 정보를 직접 포함하지 않고, 환경 변수를 사용합니다.
    ```bash
    export API_KEY="your_api_key_here"
    ```

- **비밀 관리 도구 사용**:
  - `HashiCorp Vault`, `AWS Secrets Manager`와 같은 비밀 관리 도구를 통해 민감 정보를 관리합니다.

- **파일 접근 권한 설정**:
  - 민감 정보가 포함된 파일의 접근 권한을 최소한으로 설정합니다.
    ```bash
    chmod 600 sensitive_file.txt
    ```

### 4) 가상환경 구축 권장 방법
- **가상환경 생성**:
  - 프로젝트마다 독립적인 가상환경을

'''
도서관 사서 챗봇 시나리오
- 사용자 묻고
- 챗봇(인공지능모델)은 먼저 DB(RAG) 관련 내용을 찾아본 후
- 그 정보를 참고해서 똑똑해진 후 
- 사용자에게 응답
'''

from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

# 외부 DB 개념으로 
doc = [
    '리스트는 파이썬에서 변경 가능한(mutable) 자료형으로, 요소를 추가하거나 삭제할 수 있습니다.',
    '튜플은 변경 불가능한(immutable) 자료형으로, 한 번 생성하면 수정할 수 없습니다.',
    '딕셔너리는 키(key)와 값(value)의 쌍으로 데이터를 저장합니다.'
]

# RAG 숫자배열로 변환
text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=50)
texts = text_splitter.create_documents(doc)
print(texts)

[Document(page_content='리스트는 파이썬에서 변경 가능한(mutable) 자료형으로, 요소를 추가하거나 삭제할 수 있습니다.'), Document(page_content='튜플은 변경 불가능한(immutable) 자료형으로, 한 번 생성하면 수정할 수 없습니다.'), Document(page_content='딕셔너리는 키(key)와 값(value)의 쌍으로 데이터를 저장합니다.')]

# embedding
embeddings = OpenAIEmbeddings(model='text-embedding-3-small')
db = FAISS.from_documents(texts, embedding = embeddings)
print(db)

<langchain_community.vectorstores.faiss.FAISS object at 0x00000212DEA8EB90>

# 검색
# as_retriever() : 검색 인터페이스를 이용해서 LLM 연결하는 것
# Retriever 설정
retriever = db.as_retriever(search_kwargs={'k' : 1}) #반환 문서 수:1
print(retriever)

tags=['FAISS', 'OpenAIEmbeddings'] vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x00000212DEA8EB90> search_kwargs={'k': 1}

# chain
qa = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(model='gpt-4o-mini', temperature=0.9),
    # stuff, map_reduce, refune etc....
    chain_type='stuff',
    retriever = retriever
)
print(qa)

C:\Users\snower\AppData\Local\Temp\ipykernel_9356\2873953294.py:3: LangChainDeprecationWarning: The class `ChatOpenAI` was deprecated in LangChain 0.0.10 and will be removed in 1.0. An updated version of the class exists in the langchain-openai package and should be used instead. To use it run `pip install -U langchain-openai` and import as `from langchain_openai import ChatOpenAI`.
  llm=ChatOpenAI(model='gpt-4o-mini', temperature=0.9),

combine_documents_chain=StuffDocumentsChain(llm_chain=LLMChain(prompt=ChatPromptTemplate(input_variables=['context', 'question'], messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], template="Use the following pieces of context to answer the user's question. \nIf you don't know the answer, just say that you don't know, don't try to make up an answer.\n----------------\n{context}")), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['question'], template='{question}'))]), llm=ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x00000212DEE76020>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x00000212DEF817B0>, model_name='gpt-4o-mini', temperature=0.9, openai_api_key='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', openai_proxy='')), document_variable_name='context') retriever=VectorStoreRetriever(tags=['FAISS', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x00000212DEA8EB90>, search_kwargs={'k': 1})

# 질의 : 문서기반질의(RAG가 강한 부분)
'''
간단한 사실 확인
예제코드 요청
비교-선택 도움

문서기반질의(RAG가 강한 부분)
'''
query = '파이썬 리스트의 튜플의 차이점을 설명해줘'
answer = qa.run(query)

print('Q - ', query)
print('사서가 참고한 내용 - ', retriever.get_relevant_documents(query)[0].page_content)
print('answer - ', answer)

Q -  파이썬 리스트의 튜플의 차이점을 설명해줘
사서가 참고한 내용 -  리스트는 파이썬에서 변경 가능한(mutable) 자료형으로, 요소를 추가하거나 삭제할 수 있습니다.
answer -  파이썬 리스트와 튜플의 주요 차이점은 다음과 같습니다.

1. **변경 가능성**:
   - 리스트(list)는 변경 가능한(mutable) 자료형으로, 요소를 추가하거나 삭제할 수 있습니다.
   - 튜플(tuple)은 변경 불가능한(immutable) 자료형으로, 생성한 후에는 요소를 변경할 수 없습니다.

2. **구조**:
   - 리스트는 대괄호([])로 정의되며, 다양한 데이터 타입의 요소를 가질 수 있습니다.
   - 튜플은 괄호(())로 정의되며, 마찬가지로 다양한 데이터 타입의 요소를 가질 수 있습니다.

3. **성능**:
   - 튜플은 리스트보다 메모리 사용이 적고, 일부 경우 더 빠르게 동작할 수 있습니다. 이러한 이유로 변경되지 않을 데이터 집합을 저장할 때 튜플을 사용하는 것이 효율적일 수 있습니다.

4. **사용 용도**:
   - 리스트는 데이터를 수정해야 할 때 주로 사용되며, 튜플은 데이터의 불변성을 보장해야 할 때 사용됩니다.

이러한 차이점들로 인해 상황에 맞게 리스트와 튜플을 선택하여 사용할 수 있습니다.

retriever.get_relevant_documents(query)[0].page_content

C:\Users\snower\AppData\Local\Temp\ipykernel_9356\3521827203.py:1: LangChainDeprecationWarning: The method `BaseRetriever.get_relevant_documents` was deprecated in langchain-core 0.1.46 and will be removed in 1.0. Use invoke instead.
  retriever.get_relevant_documents(query)

[Document(page_content='리스트는 파이썬에서 변경 가능한(mutable) 자료형으로, 요소를 추가하거나 삭제할 수 있습니다.')]

retriever.invoke(query)

[Document(page_content='리스트는 파이썬에서 변경 가능한(mutable) 자료형으로, 요소를 추가하거나 삭제할 수 있습니다.')]

# llm
import os
import openai
from openai import OpenAI
from dotenv import load_dotenv

# langchain
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import FAISS
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import RetrievalQA

'''
모듈 re
정규 표현식이란?
- 문자열 패턴을 만들어 문자열로부터 검색, 추출, 변환을 도와주는 형식

패턴을 만드는 방법
- 메타문자(., ^, $, *, +, ?, {}, [], ())
- 패턴(\d, \D, \w, \W, \s, \S)

re.search() : 부분일치(텍스트 내에서 첫 매치)
re.match() : 문자열 시작에서 매치
re,fullmatch() : 전체 문자열이 패턴과 일치할 때
re.sub() : 치환
re.findall() : 문자열 전체에서 모든 일치하는 부분을 리스트로 반환

re.sub(r"^```[a-zA-Z]*\n?","",csv_text]) # ```csv 또는 ```제거
re.sub(r"```$","",csv_text).strip() # ```csv 또는 ```제거
'''

'\n모듈 re\n정규 표현식이란?\n- 문자열 패턴을 만들어 문자열로부터 검색, 추출, 변환을 도와주는 형식\n\n패턴을 만드는 방법\n- 메타문자(., ^, $, *, +, ?, {}, [], ())\n- 패턴(\\d, \\D, \\w, \\W, \\s, \\S)\n\nre.search() : 부분일치(텍스트 내에서 첫 매치)\nre.match() : 문자열 시작에서 매치\nre,fullmatch() : 전체 문자열이 패턴과 일치할 때\nre.sub() : 치환\n\nre.sub(r"^```[a-zA-Z]*\n?","",csv_text]) # ```csv 또는 ```제거\nre.sub(r"```$","",csv_text).strip() # ```csv 또는 ```제거\n'

import re

txt = '문의 이메일은 jslim9413@naver.com 입니다.'

pattern = r'\w+@\w+\.\w+'

result = re.search(pattern, txt)
if result:
    print(result.group())

jslim9413@naver.com

import re

txt = '문의하신 고객의 전화번호는 010-1234-5678 이고, 이메일은 jslim9413@naver.com 입니다.'

# 전화번호 추출 패턴
patternPhone = r'0[0-9][0-9]-\d{4}-\d{4}'
phone = re.findall(patternPhone, txt)
print(phone)
# 이메일 추출 패턴
patternEmail = r'\w+@\w+\.\w+'

email = re.search(patternEmail, txt)
if result:
    print(email.group())

# Quiz
# 숫자만 추출한다면?
lst = re.findall(r'\d+', txt)
print(lst)

['010-1234-5678']
jslim9413@naver.com
['010', '1234', '5678', '9413']

'''
AI Model 반환결과는 자유로운 영혼의 문자열
필요에 따라서 특정 조건에 만족하는 데이터 추출, 변환 etc....
이럴경우 re 이용해서 처리
'''

# llm
import os, re, openai, json
from   openai import OpenAI
from   dotenv import load_dotenv

# langchain 
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chat_models       import ChatOpenAI 
from langchain.prompts           import ChatPromptTemplate
from langchain.vectorstores      import FAISS
from langchain.text_splitter     import CharacterTextSplitter
from langchain.chains            import RetrievalQA, LLMChain

load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')

def masking(key) :
    # 앞 4자리 , 뒤 4자리만 남기고 * 처리
    if len(key) <= 8 :
        return '*' * len(key)
    return key[:4] + '*' * (len(key) - 8) + key[-4:]
    
masked_api_key = masking(api_key)
# print('masked api key : ', masked_api_key)

'''
temperature(0~2)
- 값이 낮을수록 응답은 보수적(같은 입력이면 거의 같은 답변)
- 값이 높을수록 응답은 창의적(같은 입력에서도 다양한 답변)
'''
chat_uncreative = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
chat_creative = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.8)

C:\Users\snower\AppData\Local\Temp\ipykernel_17756\909341809.py:6: LangChainDeprecationWarning: The class `ChatOpenAI` was deprecated in LangChain 0.0.10 and will be removed in 1.0. An updated version of the class exists in the langchain-openai package and should be used instead. To use it run `pip install -U langchain-openai` and import as `from langchain_openai import ChatOpenAI`.
  chat_uncreative = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

prompt = "인공지능을 간단히 설명해줘."
print('uncreative - ')
print('chat_uncreative predict - ', chat_uncreative.predict(prompt))
print()
print('creative - ')
print('chat_creative predict - ', chat_creative.predict(prompt))

uncreative -

C:\Users\snower\AppData\Local\Temp\ipykernel_17756\2935213530.py:3: LangChainDeprecationWarning: The method `BaseChatModel.predict` was deprecated in langchain-core 0.1.7 and will be removed in 1.0. Use invoke instead.
  print('chat_uncreative predict - ', chat_uncreative.predict(prompt))

chat_uncreative predict -  인공지능은 인간의 학습, 추론, 판단 등의 능력을 컴퓨터 프로그램이나 기계에 구현한 기술을 말합니다. 이를 통해 기계가 인간과 유사한 지능적인 작업을 수행할 수 있게 됩니다. 인공지능은 머신러닝, 딥러닝, 자연어 처리 등 다양한 기술을 활용하여 다양한 분야에서 활용되고 있습니다.

creative - 
chat_creative predict -  인공지능은 인간의 학습능력, 추론능력, 지각능력, 음성인식능력 등을 컴퓨터 프로그램으로 구현한 기술이다. 인공지능 기술은 패턴인식, 의사결정, 자연어처리, 로봇공학 등 다양한 분야에서 활용되며, 인간의 지능적인 작업을 대신 수행하거나 보조하는 기능을 가지고 있다. 요즘에는 머신러닝과 딥러닝 기술을 활용하여 높은 수준의 성능을 보여주고 있으며, 앞으로 더욱 발전해가며 우리 삶 속에 더 많이 사용될 것으로 예상된다.

# langchain ll model
chat = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

# langchain prompt template
question = '섭섭님과 상담이 문의가 필요하면 jslim9413@naver.com 또는 jslim9413@gmail.com'
prompt = ChatPromptTemplate.from_template(
    "해당 텍스트에서 미사어구를 넣어 좋은 문장을 만들어줘 : {text}"
)

# langchain except rag
chain = LLMChain(llm=chat, prompt=prompt)

# chain 생성 후 실행(run)
answer = chain.run(text=question)
print(answer)
print()
print('re - ')
emails =re.findall(r'\w+@\w+\.\w+', answer)
print(emails)

섭섭님과 상담이 필요하신 경우, jslim9413@naver.com 또는 jslim9413@gmail.com 으로 연락해주세요.

re - 
['jslim9413@naver.com', 'jslim9413@gmail.com']

'''
Quiz) 아래 제공되는 입력정보를 바탕으로 langchain 모델을 만들고 실행해 본다.
- 조건) 프롬프트는 : json 형식으로 정리해줘
- 조건) llm의 반환값을 확인하고 json 형식인지 확인하고 해당 결과를 정규표현식을 사용하여 데이터만 추출하고
- 조건) 구조화된 json 데이터를 만들기(json.dumps()) 
'''

reservation = """
이름  : 임섭순
연락처: 010-1234-5678
예약일: 2025-11-15
문의 이메일: jslim9413@naver.com
추가 이메일: jslim9413@gmail.com
"""

# langchain ll model
chat = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)

# langchain prompt template
reservation = """
이름  : 임섭순
연락처: 010-1234-5678
예약일: 2025-11-15
문의 이메일: jslim9413@naver.com
추가 이메일: jslim9413@gmail.com
"""
prompt = ChatPromptTemplate.from_template(
    """입력받은 텍스트를 이름,이메일,전화번호,날짜를 추출하고 json 형식으로 전달해줘.
    {text}
    """
)

# langchain except rag
chain = LLMChain(llm=chat, prompt=prompt)

# chain 생성 후 실행(run)
answer = chain.run(text=reservation)
print(answer)
print()
pattern = r'json\s*(\{[\s\S]*?\})'
match = re.search(pattern, answer)
if match :
    result = match.group(1)
    print(">>>>>>>>>>>> result")
    print(result)
    # json.dumps() : (dict -> json), json.loads() : (json -> dict)
    print('type - ', type(json.loads(result)))
    print(json.loads(result))

# print()
# r = re.search(r"\{[\s\S]*\}", answer)
# print(r.group())
# print()
# rr = r.group()
# print(json.loads(rr))

입력받은 텍스트에서 추출한 정보를 JSON 형식으로 변환하면 다음과 같습니다:

```json
{
  "이름": "임섭순",
  "전화번호": "010-1234-5678",
  "예약일": "2025-11-15",
  "이메일": [
    "jslim9413@naver.com",
    "jslim9413@gmail.com"
  ]
}
```

이 JSON 객체는 이름, 전화번호, 예약일, 그리고 이메일 목록을 포함하고 있습니다.

>>>>>>>>>>>> result
{
  "이름": "임섭순",
  "전화번호": "010-1234-5678",
  "예약일": "2025-11-15",
  "이메일": [
    "jslim9413@naver.com",
    "jslim9413@gmail.com"
  ]
}
type -  <class 'dict'>
{'이름': '임섭순', '전화번호': '010-1234-5678', '예약일': '2025-11-15', '이메일': ['jslim9413@naver.com', 'jslim9413@gmail.com']}

'''
Quiz
AI 보안 탐정: 해킹 로그에서 단서를 찾아라!
LangChain과 FAISS로 보안 로그를 분석하고, Streamlit으로 시각화하는 RAG 기반 대시보드 만들기

"SQL Injection": "로그인 쿼리에서 의심스러운 SQL 페이로드 탐지",
"XSS": "웹 폼 입력란에서 스크립트 삽입 시도 감지",
"Brute Force": "여러 번의 로그인 실패 시도 관찰됨",
"DDoS": "여러 IP에서 대량의 요청 발생",
"Port Scan": "순차적인 포트 접근 패턴 탐지",
"Ransomware": "랜섬웨어 동작으로 보이는 암호화된 트래픽 탐지",
"Phishing": "악성 이메일 링크 탐지",
"Command Injection": "HTTP 요청 내 쉘 명령어 패턴 발견",
"CSRF": "크로스사이트 요청 위조 시도 감지",
"Directory Traversal": "상위 디렉토리 접근 시도 탐지"
'''

'\nQuiz\nAI 보안 탐정: 해킹 로그에서 단서를 찾아라!\nLangChain과 FAISS로 보안 로그를 분석하고, Streamlit으로 시각화하는 RAG 기반 대시보드 만들기\n'

# analysis & visualization
import pandas         as pd
import numpy          as np
import streamlit      as st
import plotly.express as px 
# llm
import re, os, json, openai, random, time
from   openai import OpenAI
from   dotenv import load_dotenv

# langchain 
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chat_models       import ChatOpenAI 
from langchain.prompts           import ChatPromptTemplate
from langchain.vectorstores      import FAISS
from langchain.text_splitter     import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.chains            import RetrievalQA, LLMChain

frm = pd.read_csv('./data/attack_logs_ko.csv')
frm.head()

database = []
for idx, row in frm.iterrows() :
    txt = f"time: {row['time']}, ip: {row['ip']}, country: {row['country']}, attack: {row['attack_type']}, description: {row['description']}"
    database.append(txt)

database[:10]

['time: 2025-11-04 06:13:00, ip: 190.185.214.32, country: Russia, attack: Brute Force, description: 여러 번의 로그인 실패 시도 관찰됨',
 'time: 2025-11-06 17:06:00, ip: 120.136.155.120, country: France, attack: CSRF, description: 크로스사이트 요청 위조 시도 감지',
 'time: 2025-11-05 16:40:00, ip: 130.16.114.226, country: UK, attack: CSRF, description: 크로스사이트 요청 위조 시도 감지',
 'time: 2025-11-03 10:52:00, ip: 17.99.212.168, country: France, attack: Phishing, description: 악성 이메일 링크 탐지',
 'time: 2025-11-02 08:32:00, ip: 101.167.18.159, country: UK, attack: Phishing, description: 악성 이메일 링크 탐지',
 'time: 2025-11-07 03:15:00, ip: 42.124.149.195, country: UK, attack: Command Injection, description: HTTP 요청 내 쉘 명령어 패턴 발견',
 'time: 2025-11-02 14:35:00, ip: 40.239.17.99, country: India, attack: CSRF, description: 크로스사이트 요청 위조 시도 감지',
 'time: 2025-11-03 14:31:00, ip: 100.232.175.193, country: Russia, attack: Port Scan, description: 순차적인 포트 접근 패턴 탐지',
 'time: 2025-11-07 10:33:00, ip: 168.131.57.97, country: Germany, attack: Directory Traversal, description: 상위 디렉토리 접근 시도 탐지',
 'time: 2025-11-04 01:10:00, ip: 198.156.77.116, country: India, attack: Directory Traversal, description: 상위 디렉토리 접근 시도 탐지']

load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')

def masking(key) :
    # 앞 4자리 , 뒤 4자리만 남기고 * 처리
    if len(key) <= 8 :
        return '*' * len(key)
    return key[:4] + '*' * (len(key) - 8) + key[-4:]
    
masked_api_key = masking(api_key)
# print('masked api key : ', masked_api_key)

# 텍스트를 분할하고 vector db (숫자배열)
splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=20)
docs = splitter.create_documents(database)

embeddings = OpenAIEmbeddings()
vectorDB = FAISS.from_documents(docs, embeddings)

retriever = vectorDB.as_retriever(search_kwargs={'k' : 3})
llm = ChatOpenAI(model='gpt-4o-mini', temperature=0.3)
qaChain = RetrievalQA.from_chain_type(llm=llm, chain_type='stuff', retriever=retriever)

query = '어떤 공격 유형이 가장 많이 발생했나요'

response = qaChain.run(query)
print(response)

C:\Users\snower\AppData\Local\Temp\ipykernel_12592\2660451405.py:3: LangChainDeprecationWarning: The method `Chain.run` was deprecated in langchain 0.1.0 and will be removed in 1.0. Use invoke instead.
  response = qaChain.run(query)

제공된 정보에 따르면, DDoS 공격이 여러 번 발생했습니다. 따라서 DDoS 공격 유형이 가장 많이 발생한 것으로 보입니다.

	위도	경도
KAIST 서울캠퍼스	37.592573	127.046737
KC대학교	37.548345	126.854797
가톨릭대학교(성신교정)	37.585922	127.004328
가톨릭대학교(성의교정)	37.499623	127.006065
감리교신학대학교	37.567645	126.961610

로그인 하세요 :

SKS 노트북 전체 보기

출력형식 정리¶

학습목표¶

열거형 : 숫자형 값을 생성하는 객체타입¶

list comprehension¶

dict¶

학습목표¶

함수(function - oop method) oop 객체지향에서는 method라고 표현¶

학습목표¶

파일 입출력¶

학습목표¶

학습목표¶

sort : 인덱스 기준, 열 값을 기준으로 # ascending = True(오름차순) False(내림차순)¶

통계량 확인¶

학습목표¶

학습목표¶

학습목표¶

학습목표¶

학습목표¶

학습목표¶

	kor	eng	mat
강승우	90	90	90
최호준	85	85	85
임정섭	100	100	100
이현우	88	88	88
오신호	78	78	78

	survived	pclass	sex	age	sibsp	fare	embarked	class	who	adult_male	deck	embark_town	alive	alone
0	0	3	male	22.0	1	7.2500	S	Third	man	True	NaN	Southampton	no	False
1	1	1	female	38.0	1	71.2833	C	First	woman	False	C	Cherbourg	yes	False
2	1	3	female	26.0	0	7.9250	S	Third	woman	False	NaN	Southampton	yes	True
3	1	1	female	35.0	1	53.1000	S	First	woman	False	C	Southampton	yes	False
4	0	3	male	35.0	0	8.0500	S	Third	man	True	NaN	Southampton	no	True

	survived	pclass	sex	age	sibsp	parch	fare	embarked	class	who	adult_male	deck	embark_town	alive	alone
index
0	0	3	male	22.0	1	0	7.2500	S	Third	man	True	NaN	Southampton	no	False
1	1	3	female	26.0	0	0	7.9250	S	Third	woman	False	NaN	Southampton	yes	True
2	0	3	male	35.0	0	0	8.0500	S	Third	man	True	NaN	Southampton	no	True
3	0	3	male	NaN	0	0	8.4583	Q	Third	man	True	NaN	Queenstown	no	True
4	0	3	male	2.0	3	1	21.0750	S	Third	child	False	NaN	Southampton	no	False
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
486	0	3	female	22.0	0	0	10.5167	S	Third	woman	False	NaN	Southampton	no	True
487	0	3	male	25.0	0	0	7.0500	S	Third	man	True	NaN	Southampton	no	True
488	0	3	female	39.0	0	5	29.1250	Q	Third	woman	False	NaN	Queenstown	no	False
489	0	3	female	NaN	1	2	23.4500	S	Third	woman	False	NaN	Southampton	no	False
490	0	3	male	32.0	0	0	7.7500	Q	Third	man	True	NaN	Queenstown	no	True

	survived	pclass	sex	age	sibsp	parch	fare	embarked	class	who	adult_male	deck	embark_town	alive	alone
0	1	1	male	80.0	0	0	30.0000	S	First	man	True	A	Southampton	yes	True
1	0	3	male	74.0	0	0	7.7750	S	Third	man	True	NaN	Southampton	no	True
2	0	1	male	71.0	0	0	49.5042	C	First	man	True	NaN	Cherbourg	no	True
3	0	1	male	71.0	0	0	34.6542	C	First	man	True	A	Cherbourg	no	True
4	0	3	male	70.5	0	0	7.7500	Q	Third	man	True	NaN	Queenstown	no	True
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
886	0	3	male	NaN	0	0	7.2292	C	Third	man	True	NaN	Cherbourg	no	True
887	0	3	female	NaN	8	2	69.5500	S	Third	woman	False	NaN	Southampton	no	False
888	0	3	male	NaN	0	0	9.5000	S	Third	man	True	NaN	Southampton	no	True
889	0	3	male	NaN	0	0	7.8958	S	Third	man	True	NaN	Southampton	no	True
890	0	3	female	NaN	1	2	23.4500	S	Third	woman	False	NaN	Southampton	no	False

	NAME	GENDER	COUNT
0	Isabella	F	22731
1	Sophia	F	20477
2	Emma	F	17179
3	Olivia	F	16860
4	Ava	F	15300

	NAME	GENDER	COUNT
0	Isabella	F	22731
1	Jacob	M	21875
2	Sophia	F	20477
3	Ethan	M	17866
4	Emma	F	17179
...	...	...	...
33833	Xaine	M	5
33834	Xaveon	M	5
33835	Xavious	M	5
33836	Xiomar	M	5
33837	Xylan	M	5

	id	gender	height	age	region
0	1	남자	175	22	서울
1	2	여자	160	23	서울
2	3	여자	161	21	서울
3	4	여자	170	33	서울
4	5	여자	155	35	경기
5	6	남자	181	41	서울
6	7	남자	183	33	충북
7	8	여자	171	22	서울
8	9	남자	188	29	경기
9	10	남자	177	39	인천
10	11	여자	152	40	서울
11	12	남자	180	24	서울

	id			height			age
	mean	var	std	mean	var	std	mean	var	std
gender
남자	7.5	14.7	3.834058	180.666667	21.066667	4.589844	31.333333	60.266667	7.763161
여자	5.5	11.5	3.391165	161.500000	59.500000	7.713624	29.000000	64.400000	8.024961

	total_bill	tip	sex	smoker	day	time	size
0	16.99	1.01	Female	No	Sun	Dinner	2
1	10.34	1.66	Male	No	Sun	Dinner	3
2	21.01	3.50	Male	No	Sun	Dinner	3
3	23.68	3.31	Male	No	Sun	Dinner	2
4	24.59	3.61	Female	No	Sun	Dinner	4

	sepal_length	sepal_width	petal_length	petal_width	species
0	5.1	3.5	1.4	0.2	setosa
1	4.9	3.0	1.4	0.2	setosa
2	4.7	3.2	1.3	0.2	setosa
3	4.6	3.1	1.5	0.2	setosa
4	5.0	3.6	1.4	0.2	setosa

	sepal_length	sepal_width	petal_length	petal_width	species
100	6.3	3.3	6.0	2.5	virginica
101	5.8	2.7	5.1	1.9	virginica
102	7.1	3.0	5.9	2.1	virginica
103	6.3	2.9	5.6	1.8	virginica
104	6.5	3.0	5.8	2.2	virginica
105	7.6	3.0	6.6	2.1	virginica
106	4.9	2.5	4.5	1.7	virginica
107	7.3	2.9	6.3	1.8	virginica
108	6.7	2.5	5.8	1.8	virginica
109	7.2	3.6	6.1	2.5	virginica
110	6.5	3.2	5.1	2.0	virginica
111	6.4	2.7	5.3	1.9	virginica
112	6.8	3.0	5.5	2.1	virginica
113	5.7	2.5	5.0	2.0	virginica
114	5.8	2.8	5.1	2.4	virginica
115	6.4	3.2	5.3	2.3	virginica
116	6.5	3.0	5.5	1.8	virginica
117	7.7	3.8	6.7	2.2	virginica
118	7.7	2.6	6.9	2.3	virginica
119	6.0	2.2	5.0	1.5	virginica
120	6.9	3.2	5.7	2.3	virginica
121	5.6	2.8	4.9	2.0	virginica
122	7.7	2.8	6.7	2.0	virginica
123	6.3	2.7	4.9	1.8	virginica
124	6.7	3.3	5.7	2.1	virginica
125	7.2	3.2	6.0	1.8	virginica
126	6.2	2.8	4.8	1.8	virginica
127	6.1	3.0	4.9	1.8	virginica
128	6.4	2.8	5.6	2.1	virginica
129	7.2	3.0	5.8	1.6	virginica
130	7.4	2.8	6.1	1.9	virginica
131	7.9	3.8	6.4	2.0	virginica
132	6.4	2.8	5.6	2.2	virginica
133	6.3	2.8	5.1	1.5	virginica
134	6.1	2.6	5.6	1.4	virginica
135	7.7	3.0	6.1	2.3	virginica
136	6.3	3.4	5.6	2.4	virginica
137	6.4	3.1	5.5	1.8	virginica
138	6.0	3.0	4.8	1.8	virginica
139	6.9	3.1	5.4	2.1	virginica

	timestamp	user	ip	status	delay_ms
0	2025-11-06 00:00:00	root	192.168.0.3	fail	257
1	2025-11-06 01:00:00	admin	192.168.0.7	fail	688
2	2025-11-06 02:00:00	analyst	192.168.0.3	success	559
3	2025-11-06 03:00:00	analyst	192.168.0.3	success	688
4	2025-11-06 04:00:00	guest	192.168.0.3	fail	459

	sepal_length	sepal_width	petal_length	petal_width	species
50	7.0	3.2	4.7	1.4	versicolor
51	6.4	3.2	4.5	1.5	versicolor
52	6.9	3.1	4.9	1.5	versicolor
53	5.5	2.3	4.0	1.3	versicolor
54	6.5	2.8	4.6	1.5	versicolor
55	5.7	2.8	4.5	1.3	versicolor
56	6.3	3.3	4.7	1.6	versicolor
57	4.9	2.4	3.3	1.0	versicolor
58	6.6	2.9	4.6	1.3	versicolor
59	5.2	2.7	3.9	1.4	versicolor
60	5.0	2.0	3.5	1.0	versicolor
61	5.9	3.0	4.2	1.5	versicolor
62	6.0	2.2	4.0	1.0	versicolor
63	6.1	2.9	4.7	1.4	versicolor
64	5.6	2.9	3.6	1.3	versicolor
65	6.7	3.1	4.4	1.4	versicolor
66	5.6	3.0	4.5	1.5	versicolor
67	5.8	2.7	4.1	1.0	versicolor
68	6.2	2.2	4.5	1.5	versicolor
69	5.6	2.5	3.9	1.1	versicolor
70	5.9	3.2	4.8	1.8	versicolor
71	6.1	2.8	4.0	1.3	versicolor
72	6.3	2.5	4.9	1.5	versicolor
73	6.1	2.8	4.7	1.2	versicolor
74	6.4	2.9	4.3	1.3	versicolor
75	6.6	3.0	4.4	1.4	versicolor
76	6.8	2.8	4.8	1.4	versicolor
77	6.7	3.0	5.0	1.7	versicolor
78	6.0	2.9	4.5	1.5	versicolor
79	5.7	2.6	3.5	1.0	versicolor
80	5.5	2.4	3.8	1.1	versicolor
81	5.5	2.4	3.7	1.0	versicolor
82	5.8	2.7	3.9	1.2	versicolor
83	6.0	2.7	5.1	1.6	versicolor
84	5.4	3.0	4.5	1.5	versicolor
85	6.0	3.4	4.5	1.6	versicolor
86	6.7	3.1	4.7	1.5	versicolor
87	6.3	2.3	4.4	1.3	versicolor
88	5.6	3.0	4.1	1.3	versicolor
89	5.5	2.5	4.0	1.3	versicolor
90	5.5	2.6	4.4	1.2	versicolor
91	6.1	3.0	4.6	1.4	versicolor
92	5.8	2.6	4.0	1.2	versicolor
93	5.0	2.3	3.3	1.0	versicolor
94	5.6	2.7	4.2	1.3	versicolor
95	5.7	3.0	4.2	1.2	versicolor
96	5.7	2.9	4.2	1.3	versicolor
97	6.2	2.9	4.3	1.3	versicolor
98	5.1	2.5	3.0	1.1	versicolor
99	5.7	2.8	4.1	1.3	versicolor

	user	delay_ms
0	root	160.264908
1	root	210.451747
2	root	134.348377
3	guest	185.193624
4	guest	255.948266

	timestamp	user	ip	status	delay_ms
0	2025-11-06 00:00:00	guest	192.168.0.9	fail	726
1	2025-11-06 01:00:00	superAdmin	192.168.0.3	success	798
2	2025-11-06 02:00:00	admin	192.168.0.3	success	585
3	2025-11-06 03:00:00	analyst	192.168.0.5	success	97
4	2025-11-06 04:00:00	admin	192.168.0.1	success	756

	manufacturer	model	displ	year	cyl	trans	drv	cty	hwy	fl	class
1	audi	a4	1.8	1999	4	auto(l5)	f	18	29	p	compact
2	audi	a4	1.8	1999	4	manual(m5)	f	21	29	p	compact
3	audi	a4	2.0	2008	4	manual(m6)	f	20	31	p	compact
4	audi	a4	2.0	2008	4	auto(av)	f	21	30	p	compact
5	audi	a4	2.8	1999	6	auto(l5)	f	16	26	p	compact

	time	ip	country	attack_type	description
0	2025-11-04 06:13:00	190.185.214.32	Russia	Brute Force	여러 번의 로그인 실패 시도 관찰됨
1	2025-11-06 17:06:00	120.136.155.120	France	CSRF	크로스사이트 요청 위조 시도 감지
2	2025-11-05 16:40:00	130.16.114.226	UK	CSRF	크로스사이트 요청 위조 시도 감지
3	2025-11-03 10:52:00	17.99.212.168	France	Phishing	악성 이메일 링크 탐지
4	2025-11-02 08:32:00	101.167.18.159	UK	Phishing	악성 이메일 링크 탐지