피쳐선택

import numpy as np
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

# Step 1: load the dataset, then separate the feature table from the target.
# `df` is the object returned by load_diabetes(); its .data, .feature_names
# and .target attributes are read below.
df = load_diabetes()
X = pd.DataFrame(data=df.data, columns=df.feature_names)
y = df.target
# Template for using a custom DataFrame instead of the bundled dataset:
#   y = df1_solve['Target']; X = df1_solve.drop('Target', axis=1)

# One-hot encode categorical columns (missing-value handling and scaling are
# assumed to be finished already). Continuous columns pass through unchanged.
X_encoded = pd.get_dummies(X, drop_first=True)

# Train/test split with 20% of the rows held out and a fixed seed.
# The raw feature table `X` is what gets split here; pass `X_encoded` in its
# place to split the one-hot encoded table instead.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

트리 기반 모델(Random Forest), 특성 선택(Feature Selection)

from sklearn.ensemble import RandomForestRegressor

# Step 2: fit the model.
# The target produced by load_diabetes() is a continuous quantity, so a
# regressor is used here. A classifier would treat every distinct target value
# as its own class, and the rest of this file already models the same data with
# RandomForestRegressor.
# Feature selection is the goal, so the default (untuned) model is used.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

FAMD

df['target'].astype('category')로 변환할 경우, Pandas는 이를 범주형으로 취급하지만 카테고리의 내부(underlying) 데이터 타입은 여전히 정수형이다.

prince.FAMD는 내부적으로 다중 대응 분석(MCA)을 수행하기 위해 범주형 변수들로부터 지시 행렬(Indicator Matrix)을 생성한다(이 과정에서 pd.get_dummies 등을 활용한다).

이때 범주값이 순수 문자열(str 또는 object)이 아닐 경우 데이터 타입의 일관성이 깨졌다고 판단하여 연산을 중단한다.

import prince
 
# 1. Create the FAMD model object.
famd = prince.FAMD(
    n_components=2, # number of components (dimensions) to reduce to
    n_iter=3,       # number of iterations for the SVD computation
    random_state=42 # fixed seed so results are reproducible
)
 
# 2. Fit on the mixed-type DataFrame and transform it in a single call.
# NOTE(review): the original note says this step uses `df1_solve`, while earlier
# in this file `df` is assigned the return value of load_diabetes() -- confirm
# that `df` is bound to the intended mixed-type pandas DataFrame before running.
# Optional pre-step kept from the original note (casts the target column to str):
#   df['target'] = df['target'].astype(str)
famd_result = famd.fit_transform(df)
# Last expression of the cell: shows the first three transformed rows.
famd_result.head(3)

	0	1
0	9.653790	-2.937826
1	9.634149	-2.916487
2	9.657403	-2.923354

Drop-Column Importance

특성	Drop Column	Permutation	Impurity-based	SHAP
동작 알고리즘	특정 피처를 데이터셋에서 완전히 제외한 후 모델을 재학습시켜, 전체 피처를 사용한 기준 모델(Base Model)과의 성능 지표(예: RMSE, Accuracy 등) 차이를 해당 피처의 중요도로 산출한다.	학습 완료된 모델을 대상으로 특정 피처 열의 값만 무작위로 섞어(Shuffle) 기존 타겟과의 상관관계를 파괴한 뒤 예측을 수행하고, 기준 모델 대비 성능 하락폭을 계산하여 중요도를 산출한다.	트리 계열 모델 학습 과정 중, 특정 피처가 각 노드를 분할할 때 발생하는 불순도(Gini Index, Entropy 등) 감소량에 해당 노드에 도달한 샘플 비율을 가중치로 곱하여 합산한 값을 중요도로 산출한다.	가능한 모든 피처 조합의 부분집합(Coalition)을 구성한 뒤, 특정 피처가 포함되었을 때와 제외되었을 때의 예측값 차이(Marginal Contribution)를 계산하고, 이에 대한 가중 평균(Shapley Value)을 구하여 중요도로 산출한다.
모델재학습	필요	불필요	불필요 (학습 시 자동 산출)	불필요
연산 비용	매우 높음	낮음 (예측만 수행)	매우 낮음	매우 높음 (트리 제외)
해석 관점	특정 변수 완전 부재 시의 성능 변화	특정 변수 정보가 노이즈로 변할 때의 성능 변화	훈련 데이터 노드 분할 시 불순도 감소 기여	개별 예측값에 대한 한계 기여도의 게임 이론적 분배
주요 장점	특정 변수의 유무가 모델 성능에 미치는 한계 효과(Marginal effect)를 가장 정확하고 직접적으로 측정	재학습이 필요 없어 DCI 에 비해 연산 비용이 적다, 모델 종류에 구애받지 않음 (Model-agnostic)	학습 시 자동 산출되므로 별도 계산 비용 없음	변수의 전역적(Global) 중요도뿐만 아니라 국소적 해석(Local) 가능, 게임 이론 기반의 일관성 보장
주요 단점	변수가 p개 존재할 경우, 전체 모델 학습 1회와 개별 변수 제거 후 학습 p회를 합쳐 총 p+1번의 모델 학습 - 연산 소요 극심, 다중공선성 시 과소평가	다중공선성 높은 변수 그룹이 있을시 비현실적 데이터 분포 생성 위험	연속형 변수나 카디널리티 높은 변수에 중요도가 과적합 편향되는 통계적 결함, 훈련 데이터에 대한 불순도 감소량을 보므로 과적합된 변수를 중요한 것으로 착각할 수 있음	엄밀한 계산 시 연산량 과다, 설명 모델 복잡성 (단, 트리에 특화된 TreeSHAP을 사용하면 속도 개선이 가능)

DCI(Drop-Column Importance): 다중공선성(Multicollinearity)의 영향을 받는다. 상관관계가 높은 변수들이 존재할 경우, 하나의 변수를 제거해도 다른 변수가 그 정보를 대체할 수 있으므로 중요도가 과소평가될 위험이 있다.
IPI(Impurity-based Importance): 랜덤 포레스트 등 트리 기반(Tree-based) 모델에서 특정 변수가 노드를 분할할 때 지니 불순도(Gini Impurity)나 정보 획득량(Information Gain)을 얼마나 개선했는지 측정하여 가중 평균을 낸다.
SHAP: 협조적 게임 이론의 섀플리 값(Shapley Value)을 머신러닝에 적용한 기법으로, 모든 가능한 변수 조합에 대해 특정 변수가 추가되었을 때의 한계 기여도를 평균 내어 산출한다.

if 'Drop Column Importance':
    # The string conditions in this section are always truthy; they only act as
    # labelled, collapsible regions.

    if '베이스모델 학습, 작성':
        from sklearn.ensemble import RandomForestRegressor

        # Reference model trained on every feature.
        model = RandomForestRegressor(random_state=42)
        model.fit(X_train, y_train)

    if '베이스모델 점수 체크':
        from sklearn.metrics import r2_score

        # Baseline R^2 on the held-out split with all features present.
        y_pred_full = model.predict(X_test)
        score_full = r2_score(y_test, y_pred_full)

    def _r2_without(feature):
        """Retrain with `feature` removed and return the R^2 on the test split."""
        reduced_model = RandomForestRegressor(random_state=42)
        reduced_model.fit(X_train.drop(columns=[feature]), y_train)
        reduced_pred = reduced_model.predict(X_test.drop(columns=[feature]))
        return r2_score(y_test, reduced_pred)

    # Importance of a feature = baseline score minus the score obtained after
    # dropping that feature and retraining.
    drop_importances = {
        feature: score_full - _r2_without(feature)
        for feature in X_train.columns
    }

    # Report the features from the largest score drop to the smallest.
    print("Drop Column Importance:")
    ranked_features = sorted(drop_importances, key=drop_importances.get, reverse=True)
    for feature in ranked_features:
        print(f"{feature}: {drop_importances[feature]:.4f}")



    # The string literal below is never executed; it keeps an alternative
    # sketch based on backward SequentialFeatureSelector for reference.
    '''
 
    from sklearn.feature_selection import SequentialFeatureSelector
    from sklearn.ensemble import RandomForestRegressor
 
    model = RandomForestRegressor(random_state=42)
 
    # 후진 선택법(Backward Selection) 설정
    # n_features_to_select='auto'로 설정하면 특정 임계값 또는 절반을 선택
    # cv 파라미터로 교차 검증 폴드 수 지정
    sfs = SequentialFeatureSelector(
        model, 
        n_features_to_select=5, # 남길 변수의 개수 지정
        direction='backward',   # 후진 제거 (Drop 논리)
        scoring='r2',
        cv=3
    )
 
    sfs.fit(X_train, y_train)
 
    # Drop Column 논리를 통과하여 최종적으로 살아남은 변수 마스크 출력
    selected_features = X.columns[sfs.get_support()]
    print("Drop Column 기법(Backward SFS)을 통해 선택된 변수:", selected_features)
    '''
 
 
 
if 'permutation':
    from sklearn.inspection import permutation_importance

    # Permutation importance of the already-fitted `model` on the held-out
    # split, scored with R^2 (n_repeats: how many times each feature is shuffled).
    result = permutation_importance(model, X_test, y_test, n_repeats=10, random_state=42, scoring='r2')

    # Label the scores with the columns of the frame that was actually scored
    # (X_test), so the labels stay aligned even if the split is switched to a
    # different feature table such as X_encoded.
    perm_importances = pd.Series(result.importances_mean, index=X_test.columns)
    print("\nPermutation Importance:")
    # Sorted by mean importance, largest first.
    print(perm_importances.sort_values(ascending=False))
 
 
 
 
 
if 'Impurity-based Importance':

    # Read the feature_importances_ attribute of the fitted model.
    # The labels come from X_train, the frame `model` was fitted on, so they
    # stay aligned even if the split is switched to a different feature table.
    impurity_importances = pd.Series(model.feature_importances_, index=X_train.columns)

    # Largest importance first.
    print(impurity_importances.sort_values(ascending=False))
 
 
 
 
 
if 'SHAP':

    import shap

    # 1. Build a TreeExplainer around the fitted tree model.
    explainer = shap.TreeExplainer(model)

    # 2. Compute SHAP values for the held-out split.
    shap_values = explainer.shap_values(X_test)

    # 3. Global importance: mean absolute SHAP value per feature.
    shap_importances = np.abs(shap_values).mean(axis=0)
    # Label with the columns of the frame that was explained (X_test), so the
    # labels stay aligned even if the split is switched to another feature table.
    shap_series = pd.Series(shap_importances, index=X_test.columns)

    print("\nSHAP Global Importance (Mean Absolute SHAP Value):")
    print(shap_series.sort_values(ascending=False))

    # Optional visualisation of the same values:
    #   shap.summary_plot(shap_values, X_test)

song-ps

Explorer

071. 피쳐선택

피쳐선택

트리 기반 모델(Random Forest), 특성 선택(Feature Selection)

FAMD

Drop-Column Importance

Graph View

Table of Contents

Backlinks