Post

간단한 classification

Breast Cancer Wisconsin (Diagnostic) Data Set

0. Settings

1
2
3
4
5
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
1
2
3
4
5
# Path to the Kaggle Breast Cancer Wisconsin (Diagnostic) CSV.
file_name = '/kaggle/input/breast-cancer-wisconsin-data/data.csv'

# Load the full dataset into a DataFrame.
df = pd.read_csv(file_name)

1. EDA

1.1. Preview

1
# Peek at the first five rows to sanity-check the load.
df.head()
diagnosis | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | ... | radius_worst | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst
0117.9910.38122.801001.00.118400.277600.30010.147100.2419...25.3817.33184.602019.00.16220.66560.71190.26540.46010.11890
1120.5717.77132.901326.00.084740.078640.08690.070170.1812...24.9923.41158.801956.00.12380.18660.24160.18600.27500.08902
2119.6921.25130.001203.00.109600.159900.19740.127900.2069...23.5725.53152.501709.00.14440.42450.45040.24300.36130.08758
3111.4220.3877.58386.10.142500.283900.24140.105200.2597...14.9126.5098.87567.70.20980.86630.68690.25750.66380.17300
4120.2914.34135.101297.00.100300.132800.19800.104300.1809...22.5416.67152.201575.00.13740.20500.40000.16250.23640.07678

5 rows × 31 columns

1
# Column dtypes and non-null counts: 569 rows, 33 columns, including an
# 'id' column and an all-null 'Unnamed: 32' trailing column.
df.info()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             569 non-null    float64
 15  area_se                  569 non-null    float64
 16  smoothness_se            569 non-null    float64
 17  compactness_se           569 non-null    float64
 18  concavity_se             569 non-null    float64
 19  concave points_se        569 non-null    float64
 20  symmetry_se              569 non-null    float64
 21  fractal_dimension_se     569 non-null    float64
 22  radius_worst             569 non-null    float64
 23  texture_worst            569 non-null    float64
 24  perimeter_worst          569 non-null    float64
 25  area_worst               569 non-null    float64
 26  smoothness_worst         569 non-null    float64
 27  compactness_worst        569 non-null    float64
 28  concavity_worst          569 non-null    float64
 29  concave points_worst     569 non-null    float64
 30  symmetry_worst           569 non-null    float64
 31  fractal_dimension_worst  569 non-null    float64
 32  Unnamed: 32              0 non-null      float64
dtypes: float64(31), int64(1), object(1)
memory usage: 146.8+ KB
1
# Count duplicated patient ids — the result below is 0, so each row is a
# distinct sample.
df.id.duplicated().sum()
1
0
1
2
# There are no duplicated ids, so drop the id column and treat each row
# as an independent sample.
df = df.drop(columns=['id'])
1.1.1. 결측치 처리
1
2
# Drop the trailing all-null 'Unnamed: 32' column (the last column).
df = df.drop(df.columns[-1], axis=1)
1.1.2. Label diagnosis
1
# Class balance of the target: B vs M (357 / 212 — moderately imbalanced).
df.diagnosis.value_counts()
1
2
3
4
diagnosis
B    357
M    212
Name: count, dtype: int64
1
2
# Encode the target: 'M' (malignant) -> 1, anything else ('B') -> 0.
df['diagnosis'] = (df['diagnosis'] == 'M').astype(int)
df.diagnosis.value_counts()
1
2
3
4
diagnosis
0    357
1    212
Name: count, dtype: int64
1.1.3. Correlations
1
# Pairwise correlations between all (now fully numeric) columns.
corr = df.corr()
1
2
# Visualize the full correlation matrix as a heatmap.
sns.heatmap(corr)
plt.show()

png

2. Predict

2.1. Data Selection

1
# Ten columns most positively correlated with the target
# (includes 'diagnosis' itself at 1.0).
corr['diagnosis'].sort_values().tail(10)
1
2
3
4
5
6
7
8
9
10
11
concavity_mean          0.696360
area_mean               0.708984
radius_mean             0.730029
area_worst              0.733825
perimeter_mean          0.742636
radius_worst            0.776454
concave points_mean     0.776614
perimeter_worst         0.782914
concave points_worst    0.793566
diagnosis               1.000000
Name: diagnosis, dtype: float64
1
2
3
# Select the 9 features most strongly correlated with the target.
# Use the *absolute* correlation so that strongly negatively correlated
# predictors are not silently excluded (on this dataset the top 10 are
# all positive, so the selection is unchanged).
idx = corr['diagnosis'].abs().sort_values().tail(10).index
X = df[idx].drop(columns=['diagnosis'])
y = df['diagnosis']
1
# Preview the selected feature matrix.
X.head()
concavity_mean | area_mean | radius_mean | area_worst | perimeter_mean | radius_worst | concave points_mean | perimeter_worst | concave points_worst
00.30011001.017.992019.0122.8025.380.14710184.600.2654
10.08691326.020.571956.0132.9024.990.07017158.800.1860
20.19741203.019.691709.0130.0023.570.12790152.500.2430
30.2414386.111.42567.777.5814.910.1052098.870.2575
40.19801297.020.291575.0135.1022.540.10430152.200.1625
1
2
3
4
5
6
# One box plot per selected feature, split by diagnosis, in a 3x3 grid
# (filled column by column).
fig, axs = plt.subplots(nrows=3, ncols=3, figsize=(13, 10))

for idx, (col_name, col) in enumerate(X.items()):
    row, column = idx % 3, idx // 3
    sns.boxplot(y=col, x=y, ax=axs[row][column])

plt.show()

png

2.2. Machine Learning

1
2
3
4
5
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
1
# Default 75/25 train/test split; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
1
2
3
4
# Two-step pipeline: a scaler followed by KNN. The 'scale' step starts
# as None (passthrough); GridSearchCV substitutes concrete scalers below.
pipe = Pipeline([
    ('scale', None),
    ('model', KNeighborsClassifier())
])
1
# List every tunable pipeline parameter (step names become
# '<step>__<param>' keys usable in a GridSearchCV param_grid).
pipe.get_params()
1
2
3
4
5
6
7
8
9
10
11
12
13
{'memory': None,
 'steps': [('scale', None), ('model', KNeighborsClassifier())],
 'verbose': False,
 'scale': None,
 'model': KNeighborsClassifier(),
 'model__algorithm': 'auto',
 'model__leaf_size': 30,
 'model__metric': 'minkowski',
 'model__metric_params': None,
 'model__n_jobs': None,
 'model__n_neighbors': 5,
 'model__p': 2,
 'model__weights': 'uniform'}
1
2
3
4
5
6
7
8
9
10
11
12
13
# Hyper-parameter grid: number of neighbours and the choice of scaler
# for the pipeline's 'scale' step.
# NOTE: a commented-out 'model__class_weight' entry was removed —
# KNeighborsClassifier has no class_weight parameter, so enabling it
# would raise an error.
param_grid = {
    'model__n_neighbors': [5, 10],
    'scale': [StandardScaler(), MinMaxScaler()],
}

# Exhaustive search over the grid with 3-fold cross-validation.
mod = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=3,
)
1
# Cross-validate every grid combination, then refit the best estimator
# on the full training set.
mod.fit(X_train, y_train)
1
# Inspect per-fold CV scores and timings for each parameter combination.
pd.DataFrame(mod.cv_results_).T
0123
mean_fit_time0.0049180.0045540.0047150.004649
std_fit_time0.0003170.0000550.0000290.00001
mean_score_time0.0128270.0124980.0125170.012951
std_score_time0.0002980.0002770.0000720.000264
param_model__n_neighbors551010
param_scaleStandardScaler()MinMaxScaler()StandardScaler()MinMaxScaler()
params{'model__n_neighbors': 5, 'scale': StandardSca...{'model__n_neighbors': 5, 'scale': MinMaxScale...{'model__n_neighbors': 10, 'scale': StandardSc...{'model__n_neighbors': 10, 'scale': MinMaxScal...
split0_test_score0.9436620.9295770.936620.929577
split1_test_score0.9507040.936620.9436620.943662
split2_test_score0.936620.9436620.936620.93662
mean_test_score0.9436620.936620.9389670.93662
std_test_score0.005750.005750.003320.00575
rank_test_score1323
1
# Accuracy of the best estimator on the held-out test set.
mod.score(X_test, y_test)
1
0.951048951048951
This post is licensed under CC BY 4.0 by the author.