1. Scikit-learn 시작
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
Getting Started
2. 데이터 분할 (Train/Test Split)
# Prepare the data: ten samples with two features each. The first four rows
# are the original toy values; the extra rows continue the same pattern.
# A 20% hold-out of only four samples would leave a single test row, and
# R^2 is not defined for fewer than two samples (it evaluates to NaN).
X = np.arange(1, 21).reshape(10, 2)
y = np.arange(1, 11)
# Split the data: 80% train / 20% test, seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
3. 데이터 스케일링 (Standard Scaling)
# Create the scaler and learn its statistics from the training data only,
# so that nothing about the test set leaks into preprocessing.
scaler = StandardScaler().fit(X_train)
# Apply the same fitted transformation to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
4. 선형 회귀 모델 (Linear Regression)
# Initialize the model.
model = LinearRegression()
# Train on the standardized features produced in the scaling step; fitting
# on the raw X_train would leave the scaled arrays unused.
model.fit(X_train_scaled, y_train)
# Predict on test features transformed with the same fitted scaler.
y_pred = model.predict(X_test_scaled)
1.1. Linear Models
5. 모델 평가 (Model Evaluation)
# Compute both regression metrics on the held-out predictions.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
# Report the mean squared error, then the R^2 score.
print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')
3.4. Metrics and scoring: quantifying the quality of predictions
6. 분류 모델 (Classification)
from sklearn.datasets import load_iris
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
# Load the iris dataset and pull out the feature matrix and labels.
iris = load_iris()
X, y = iris.data, iris.target
# Hold out 20% of the samples for testing (seeded for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Create a support vector classifier with default settings and train it.
clf = SVC()
clf.fit(X=X_train, y=y_train)
# Predict labels for the held-out samples.
y_pred = clf.predict(X=X_test)
# Measure the fraction of correctly classified test samples.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy}')
1.4. Support Vector Machines
7. 교차 검증 (Cross-Validation)
from sklearn.model_selection import cross_val_score
# Run 5-fold cross-validation of the classifier on the full dataset.
scores = cross_val_score(estimator=clf, X=X, y=y, cv=5)
# Show the per-fold scores followed by their mean.
print(f'Cross-Validation Scores: {scores}')
print(f'Average CV Score: {scores.mean()}')
3.1. Cross-validation: evaluating estimator performance
8. 하이퍼파라미터 튜닝 (Hyperparameter Tuning)
from sklearn.model_selection import GridSearchCV
# Candidate values to try for the SVC hyperparameters C and gamma.
param_grid = dict(C=[0.1, 1, 10], gamma=[1, 0.1, 0.01])
# Exhaustive search over the grid; refit=True retrains the best
# configuration on the whole training set once the search finishes.
grid = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid,
    refit=True,
    verbose=2,
)
grid.fit(X_train, y_train)
# Report the best parameter combination and its cross-validated score.
print(f'Best Parameters: {grid.best_params_}')
print(f'Best Score: {grid.best_score_}')
3.2. Tuning the hyper-parameters of an estimator
9. 파이프라인 (Pipeline)
from sklearn.pipeline import Pipeline
# Build the pipeline: standardize the features, then classify with an SVC.
# The step list must be closed with "])" before the pipeline is used.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svc', SVC()),
])
# Fit every step of the pipeline on the training data.
pipeline.fit(X_train, y_train)
# Predict; the test features pass through the fitted scaler automatically.
y_pred = pipeline.predict(X_test)
6.1. Pipelines and composite estimators
10. 데이터셋 로드 (Loading Datasets)
# NOTE: load_boston was removed in scikit-learn 1.2, so importing it fails on
# current releases. The bundled diabetes dataset is used as the regression
# example instead.
from sklearn.datasets import load_iris, load_diabetes
# Load the iris dataset (classification).
iris = load_iris()
X_iris, y_iris = iris.data, iris.target
# Load the diabetes dataset (regression).
diabetes = load_diabetes()
X_diabetes, y_diabetes = diabetes.data, diabetes.target
7.1. Toy datasets
