CatBoost Property Test
Yun Han 7/30/2019
1. Epsilon
!pip install catboost
# Download the Epsilon benchmark dataset (binary classification).
from catboost.datasets import epsilon
train, test = epsilon()
# Column 0 holds the label; the remaining columns are the dense features.
X_train, y_train = train.iloc[:,1:], train[0]
X_test, y_test = test.iloc[:,1:], test[0]
# Inspect the training matrix shape (recorded output: (400000, 2000)).
X_train.shape
(400000, 2000)
X_test.shape  # recorded output: (100000, 2000)
(100000, 2000)
!pip install -U -q PyDrive
[K |โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 993kB 9.6MB/s
[?25h Building wheel for PyDrive (setup.py) ... [?25l[?25hdone
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
# This only needs to be done once in a notebook.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
# Create & upload a text file.
# Title the file should get on Google Drive.
uploaded = drive.CreateFile({'title': 'OK.csv'})
# Change to the name of the local file generated earlier.
uploaded.SetContentFile('over.csv')
uploaded.Upload()
print('Uploaded file with ID {}'.format(uploaded.get('id')))
Uploaded file with ID 1DPAcA9VzfP6HuhhSt0ifk9vXSsdCgobH
X_train.shape  # sanity re-check after the upload cell; still (400000, 2000)
(400000, 2000)
Training on CPU
from catboost import CatBoostClassifier
import timeit

def train_on_cpu():
    """Fit a 1000-iteration CatBoost classifier on the CPU (default task_type).

    Uses the module-level X_train/y_train and evaluates on (X_test, y_test)
    every 10 iterations. Returns nothing; only the wall-clock time matters here.
    """
    # Fix: the exported cell had lost its indentation (the body was at
    # column 0, which is a SyntaxError); structure restored, stray ';' removed.
    model = CatBoostClassifier(
        iterations=1000,
        learning_rate=0.03,
        boosting_type='Plain',
    )
    model.fit(
        X_train, y_train,
        eval_set=(X_test, y_test),
        verbose=10,
    )

# number=1: a single full training run is long enough to time directly.
cpu_time = timeit.timeit('train_on_cpu()',
                         setup="from __main__ import train_on_cpu",
                         number=1)
print('Time to fit model on CPU: {} sec'.format(int(cpu_time)))
Training on GPU
from catboost import CatBoostClassifier
import timeit

def train_on_gpu():
    """Fit the same 1000-iteration CatBoost configuration, but on the GPU."""
    # Fix: indentation lost in the notebook export restored; stray ';' removed.
    model = CatBoostClassifier(
        iterations=1000,
        learning_rate=0.03,
        boosting_type='Plain',
        task_type='GPU',
    )
    model.fit(
        X_train, y_train,
        eval_set=(X_test, y_test),
        verbose=10,
    )

gpu_time = timeit.timeit('train_on_gpu()',
                         setup="from __main__ import train_on_gpu",
                         number=1)
print('Time to fit model on GPU: {} sec'.format(int(gpu_time)))
# cpu_time comes from the previous cell; both cells must have been run.
print('GPU speedup over CPU: ' + '%.2f' % (cpu_time/gpu_time) + 'x')
from catboost import CatBoostClassifier
# Quick baseline: 100 GPU-accelerated iterations with default hyperparameters.
classifier_cat = CatBoostClassifier(iterations = 100, task_type = 'GPU')
# Train the model
classifier_cat.fit(X_train, y_train)
# Prediction of test data (bare expression: the notebook displays the array)
classifier_cat.predict(X_test)
# Accuracy of test data (bare expression: the notebook displays the score)
classifier_cat.score(X_test, y_test)
import lightgbm as lgb
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
# specify your configurations as a dict
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'},
    'learning_rate': 0.03
}
# Use time function to measure time elapsed
import time
start = time.time()
# Fix: the original passed valid_sets=lgb_train, so the held-out lgb_eval
# dataset was created but never used and "validation" ran on the training
# split; validate on the test split instead.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=1000,
                valid_sets=lgb_eval)
end = time.time()
print(end - start)
2. Credit card lost
from google.colab import files
# Interactively upload the credit-card CSV from the local machine;
# `uploaded` maps filename -> file bytes.
uploaded = files.upload()
<input type="file" id="files-e6110ee2-0f38-4084-a083-baf51740d777" name="files[]" multiple disabled />
<output id="result-e6110ee2-0f38-4084-a083-baf51740d777">
Upload widget is only available when the cell has been executed in the
current browser session. Please rerun this cell to enable.
</output>
<script src="/nbextensions/google.colab/files.js"></script>
Saving creditcard lost.csv to creditcard lost.csv
import io
import pandas as pd
# Parse the uploaded bytes into a DataFrame; the key must match the
# filename chosen in the upload widget ('creditcard lost.csv').
df = pd.read_csv(io.BytesIO(uploaded['creditcard lost.csv']))
df.head()
y | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | x13 | x14 | x15 | x16 | x17 | x18 | x19 | x20 | x21 | x22 | x23 | x24 | x25 | x26 | x27 | x28 | x29 | x30 | x31 | x32 | x33 | x34 | x35 | x36 | x37 | x38 | x39 | ... | x81 | x82 | x83 | x84 | x85 | x86 | x87 | x88 | x89 | x90 | x91 | x92 | x93 | x94 | x95 | x96 | x97 | x98 | x99 | x100 | x101 | x102 | x103 | x104 | x105 | x106 | x107 | x108 | x109 | x110 | x111 | x112 | x113 | x114 | x115 | x116 | x117 | x118 | x119 | x120 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 5.04 | 5.77 | 6.04 | 3.91 | 0.66 | 1.04 | 1.83 | 5.41 | 5.45 | 6.91 | 6.70 | 1.17 | 3.08 | 3.44 | 1.83 | 5.32 | 5.16 | 4.62 | 4.58 | 3.98 | 6.57 | 5.18 | 1.79 | 2.99 | 7.95 | 21.02 | 1.70 | 0.00 | 4.55 | 6.90 | 0.00 | 0.00 | 0.00 | 2.25 | 3.16 | 5.85 | 6.26 | 5.06 | 0.00 | ... | 4.94 | 5.75 | 6.23 | 5.80 | 1.45 | 2.63 | 2.19 | 5.18 | 8.17 | 3.39 | 1.14 | 7.95 | 7.95 | 8.52 | 3.41 | 3.98 | 0.0 | 3.45 | 0.00 | 0.0 | 3.45 | 0.00 | 0.0 | 0.00 | 20.69 | 7.94 | 7.70 | 2.97 | 6.88 | 4.54 | 0.68 | 7.31 | 8.04 | 10.73 | 5.58 | 0.00 | 7.52 | 5.37 | 6.18 | 4.24 |
1 | 1 | 5.98 | 2.90 | 1.94 | 1.42 | 0.10 | 3.29 | 0.81 | 9.09 | 8.04 | 4.65 | 2.65 | 0.46 | 2.25 | 6.63 | 0.00 | 6.19 | 5.06 | 0.56 | 4.80 | 6.55 | 3.49 | 0.44 | 0.00 | 3.93 | 10.62 | 4.28 | 3.83 | 3.98 | 7.52 | 24.07 | 9.26 | 0.00 | 0.00 | 0.47 | 0.04 | 10.31 | 9.35 | 4.45 | 0.04 | ... | 5.17 | 7.54 | 13.27 | 3.82 | 0.00 | 0.34 | 0.00 | 10.48 | 6.55 | 0.44 | 0.00 | 12.83 | 12.83 | 5.31 | 2.51 | 1.33 | 0.0 | 4.63 | 1.85 | 0.0 | 2.78 | 2.78 | 0.0 | 0.00 | 0.93 | 12.21 | 6.96 | 0.10 | 4.20 | 0.88 | 0.19 | 10.44 | 7.17 | 2.96 | 6.75 | 0.17 | 5.40 | 8.32 | 5.40 | 1.57 |
2 | 1 | 7.04 | 8.92 | 3.24 | 0.01 | 0.00 | 4.50 | 0.00 | 8.41 | 6.97 | 12.65 | 9.24 | 0.00 | 0.54 | 10.11 | 0.06 | 10.43 | 12.13 | 1.42 | 13.91 | 10.66 | 3.73 | 3.05 | 0.00 | 2.61 | 2.86 | 8.57 | 10.00 | 0.00 | 8.57 | 11.11 | 3.70 | 0.00 | 3.70 | 0.00 | 0.00 | 9.51 | 5.19 | 6.20 | 0.00 | ... | 7.11 | 8.28 | 8.66 | 5.33 | 0.00 | 0.13 | 0.00 | 6.01 | 6.16 | 0.08 | 0.00 | 10.00 | 7.14 | 11.43 | 20.00 | 2.86 | 0.0 | 7.41 | 14.81 | 0.0 | 0.00 | 0.00 | 0.0 | 0.00 | 11.11 | 8.47 | 9.45 | 0.00 | 4.60 | 0.43 | 0.00 | 3.77 | 2.70 | 6.84 | 7.28 | 0.00 | 5.60 | 7.52 | 5.93 | 3.89 |
3 | 1 | 13.30 | 7.16 | 5.69 | 1.98 | 0.71 | 2.20 | 0.22 | 13.89 | 1.75 | 0.00 | 3.27 | 0.00 | 10.26 | 9.46 | 4.42 | 4.14 | 4.08 | 4.32 | 4.22 | 4.11 | 3.83 | 4.08 | 4.36 | 3.99 | 7.96 | 6.19 | 5.31 | 4.42 | 11.50 | 32.14 | 3.57 | 0.00 | 0.00 | 0.58 | 0.13 | 5.83 | 1.45 | 11.60 | 0.00 | ... | 4.11 | 4.15 | 4.13 | 4.20 | 4.37 | 4.30 | 4.54 | 4.15 | 4.11 | 4.28 | 0.00 | 9.73 | 4.42 | 3.54 | 7.96 | 4.42 | 0.0 | 7.14 | 3.57 | 0.0 | 0.00 | 0.00 | 0.0 | 0.00 | 3.57 | 2.95 | 7.28 | 0.00 | 9.54 | 5.71 | 0.48 | 4.77 | 1.13 | 2.67 | 8.83 | 0.00 | 4.09 | 4.14 | 4.16 | 4.23 |
4 | 1 | 5.24 | 5.09 | 5.31 | 1.05 | 0.00 | 4.24 | 0.34 | 5.25 | 5.59 | 9.58 | 3.32 | 0.00 | 5.85 | 10.89 | 2.16 | 6.13 | 5.88 | 4.22 | 5.15 | 5.54 | 5.85 | 4.68 | 1.15 | 1.54 | 9.42 | 7.48 | 2.77 | 3.88 | 6.65 | 13.64 | 3.03 | 4.55 | 4.55 | 0.24 | 0.08 | 8.67 | 8.11 | 4.08 | 0.00 | ... | 5.93 | 6.01 | 6.05 | 6.07 | 1.64 | 3.17 | 2.25 | 6.44 | 6.60 | 2.91 | 0.28 | 9.14 | 9.42 | 4.99 | 3.60 | 2.49 | 0.0 | 15.15 | 1.52 | 0.0 | 0.00 | 0.00 | 0.0 | 4.55 | 9.09 | 7.24 | 7.63 | 0.00 | 5.11 | 3.71 | 0.00 | 6.01 | 4.16 | 4.49 | 6.14 | 0.00 | 6.18 | 6.10 | 6.06 | 5.86 |
5 rows ร 121 columns
X  # NOTE(review): X is defined only in the next cell — cells were run out of order
array([[5.04, 5.77, 6.04, ..., 5.37, 6.18, 4.24],
[5.98, 2.9 , 1.94, ..., 8.32, 5.4 , 1.57],
[7.04, 8.92, 3.24, ..., 7.52, 5.93, 3.89],
...,
[2.74, 5.42, 8.79, ..., 6.75, 4.98, 5.21],
[7.51, 6.76, 4.73, ..., 4.94, 5. , 4.93],
[3.81, 6.79, 7.66, ..., 5.17, 5.06, 4.72]])
# Get X and y: column 0 is the label y, columns x1..x120 are the features.
X = df.iloc[:, 1:].values
y = df.iloc[:, 0].values
# Split data into training and testing
from sklearn import model_selection
# Reserve 20% for testing. Fix: a fixed random_state makes the split — and
# therefore the CPU/GPU/LightGBM timing comparisons below — reproducible.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=42)
print('Training data has %d observation with %d features' % X_train.shape)
print('Test data has %d observation with %d features' % X_test.shape)
Training data has 12000 observation with 120 features
Test data has 3000 observation with 120 features
!pip install catboost
from catboost import CatBoostClassifier
# NOTE(review): this 100-iteration GPU classifier is instantiated but never
# fitted in this section; the timed runs below build their own models.
classifier_cat = CatBoostClassifier(iterations = 100, task_type = 'GPU')
X_train.shape
(12000, 120)
import timeit

def train_on_cpu():
    """Time-fit a 1000-iteration CatBoost model on CPU, tracking AUC on the test split."""
    # Fix: indentation lost in the notebook export restored; stray ';' removed.
    model = CatBoostClassifier(
        iterations=1000,
        eval_metric='AUC',
        boosting_type='Plain',
        learning_rate=0.03,
    )
    model.fit(
        X_train, y_train,
        eval_set=(X_test, y_test),
        verbose=100,
    )

# number=1: one full training run is long enough to time directly.
cpu_time = timeit.timeit('train_on_cpu()',
                         setup="from __main__ import train_on_cpu",
                         number=1)
print('Time to fit model on CPU: {} sec'.format(int(cpu_time)))
0: learn: 0.6676061 test: 0.6675290 best: 0.6675290 (0) total: 38ms remaining: 37.9s
100: learn: 0.3157643 test: 0.3226858 best: 0.3226858 (100) total: 4.22s remaining: 37.6s
200: learn: 0.3003360 test: 0.3215573 best: 0.3214868 (186) total: 8.91s remaining: 35.4s
300: learn: 0.2866442 test: 0.3216649 best: 0.3214678 (221) total: 13.5s remaining: 31.5s
400: learn: 0.2729592 test: 0.3216150 best: 0.3214438 (359) total: 18.2s remaining: 27.1s
500: learn: 0.2592651 test: 0.3218496 best: 0.3214438 (359) total: 22.8s remaining: 22.7s
600: learn: 0.2460502 test: 0.3213318 best: 0.3212648 (585) total: 27.4s remaining: 18.2s
700: learn: 0.2329541 test: 0.3218720 best: 0.3212648 (585) total: 32.1s remaining: 13.7s
800: learn: 0.2210274 test: 0.3218479 best: 0.3212648 (585) total: 36.8s remaining: 9.15s
900: learn: 0.2093181 test: 0.3221360 best: 0.3212648 (585) total: 41.6s remaining: 4.57s
999: learn: 0.1988029 test: 0.3234133 best: 0.3212648 (585) total: 46.3s remaining: 0us
bestTest = 0.3212648222
bestIteration = 585
Shrink model to first 586 iterations.
Time to fit model on CPU: 46 sec
def train_on_gpu():
    """GPU counterpart of train_on_cpu above: identical configuration plus task_type='GPU'."""
    # Fix: indentation lost in the notebook export restored; stray ';' removed.
    model = CatBoostClassifier(
        iterations=1000,
        learning_rate=0.03,
        eval_metric='AUC',
        boosting_type='Plain',
        task_type='GPU',
    )
    model.fit(
        X_train, y_train,
        eval_set=(X_test, y_test),
        verbose=100,
    )

gpu_time = timeit.timeit('train_on_gpu()',
                         setup="from __main__ import train_on_gpu",
                         number=1)
print('Time to fit model on GPU: {} sec'.format(int(gpu_time)))
# cpu_time must exist from the previous cell's run.
print('GPU speedup over CPU: ' + '%.2f' % (cpu_time/gpu_time) + 'x')
0: learn: 0.5710810 test: 0.5505786 best: 0.5505786 (0) total: 22.9ms remaining: 22.9s
100: learn: 0.8201020 test: 0.5631405 best: 0.5827100 (49) total: 2.35s remaining: 20.9s
200: learn: 0.8866887 test: 0.5592176 best: 0.5827100 (49) total: 4.79s remaining: 19.1s
300: learn: 0.9317816 test: 0.5518306 best: 0.5827100 (49) total: 7.2s remaining: 16.7s
400: learn: 0.9584097 test: 0.5486118 best: 0.5827100 (49) total: 9.55s remaining: 14.3s
500: learn: 0.9745375 test: 0.5483546 best: 0.5827100 (49) total: 12s remaining: 12s
600: learn: 0.9842969 test: 0.5495813 best: 0.5827100 (49) total: 14.5s remaining: 9.62s
700: learn: 0.9908361 test: 0.5486887 best: 0.5827100 (49) total: 16.8s remaining: 7.17s
800: learn: 0.9940755 test: 0.5510603 best: 0.5827100 (49) total: 19.2s remaining: 4.76s
900: learn: 0.9961302 test: 0.5465447 best: 0.5827100 (49) total: 21.5s remaining: 2.36s
999: learn: 0.9976045 test: 0.5430479 best: 0.5827100 (49) total: 23.9s remaining: 0us
bestTest = 0.5827099681
bestIteration = 49
Shrink model to first 50 iterations.
Time to fit model on GPU: 26 sec
GPU speedup over CPU: 1.79x
import lightgbm as lgb
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
# specify your configurations as a dict
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'},
    'learning_rate': 0.03
}
# Use time function to measure time elapsed
import time
start = time.time()
# Fix: the original validated on lgb_train, leaving the held-out lgb_eval
# dataset unused; validate on the test split instead.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=1000,
                valid_sets=lgb_eval)
end = time.time()
print(end - start)
3. Customer Satisfaction
from google.colab import files
# Interactively upload 'customer_satisfaction.csv' from the local machine.
uploaded = files.upload()
<input type="file" id="files-e5655230-bf1b-40c8-8ab5-975d6b884627" name="files[]" multiple disabled />
<output id="result-e5655230-bf1b-40c8-8ab5-975d6b884627">
Upload widget is only available when the cell has been executed in the
current browser session. Please rerun this cell to enable.
</output>
<script src="/nbextensions/google.colab/files.js"></script>
Saving customer_satisfaction.csv to customer_satisfaction.csv
import io
import pandas as pd
# Parse the uploaded bytes into a DataFrame; the key must match the uploaded filename.
data = pd.read_csv(io.BytesIO(uploaded['customer_satisfaction.csv']))
data.head()
ID | var3 | var15 | imp_ent_var16_ult1 | imp_op_var39_comer_ult1 | imp_op_var39_comer_ult3 | imp_op_var40_comer_ult1 | imp_op_var40_comer_ult3 | imp_op_var40_efect_ult1 | imp_op_var40_efect_ult3 | imp_op_var40_ult1 | imp_op_var41_comer_ult1 | imp_op_var41_comer_ult3 | imp_op_var41_efect_ult1 | imp_op_var41_efect_ult3 | imp_op_var41_ult1 | imp_op_var39_efect_ult1 | imp_op_var39_efect_ult3 | imp_op_var39_ult1 | imp_sal_var16_ult1 | ind_var1_0 | ind_var1 | ind_var2_0 | ind_var2 | ind_var5_0 | ind_var5 | ind_var6_0 | ind_var6 | ind_var8_0 | ind_var8 | ind_var12_0 | ind_var12 | ind_var13_0 | ind_var13_corto_0 | ind_var13_corto | ind_var13_largo_0 | ind_var13_largo | ind_var13_medio_0 | ind_var13_medio | ind_var13 | ... | saldo_medio_var5_ult1 | saldo_medio_var5_ult3 | saldo_medio_var8_hace2 | saldo_medio_var8_hace3 | saldo_medio_var8_ult1 | saldo_medio_var8_ult3 | saldo_medio_var12_hace2 | saldo_medio_var12_hace3 | saldo_medio_var12_ult1 | saldo_medio_var12_ult3 | saldo_medio_var13_corto_hace2 | saldo_medio_var13_corto_hace3 | saldo_medio_var13_corto_ult1 | saldo_medio_var13_corto_ult3 | saldo_medio_var13_largo_hace2 | saldo_medio_var13_largo_hace3 | saldo_medio_var13_largo_ult1 | saldo_medio_var13_largo_ult3 | saldo_medio_var13_medio_hace2 | saldo_medio_var13_medio_hace3 | saldo_medio_var13_medio_ult1 | saldo_medio_var13_medio_ult3 | saldo_medio_var17_hace2 | saldo_medio_var17_hace3 | saldo_medio_var17_ult1 | saldo_medio_var17_ult3 | saldo_medio_var29_hace2 | saldo_medio_var29_hace3 | saldo_medio_var29_ult1 | saldo_medio_var29_ult3 | saldo_medio_var33_hace2 | saldo_medio_var33_hace3 | saldo_medio_var33_ult1 | saldo_medio_var33_ult3 | saldo_medio_var44_hace2 | saldo_medio_var44_hace3 | saldo_medio_var44_ult1 | saldo_medio_var44_ult3 | var38 | TARGET | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 2 | 23 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0.00 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.00 | 0.0 | 0.00 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 39205.170000 | 0 |
1 | 3 | 2 | 34 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | ... | 0.00 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.00 | 300.0 | 122.22 | 300.0 | 240.75 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 49278.030000 | 0 |
2 | 4 | 2 | 23 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 3.00 | 2.07 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.00 | 0.0 | 0.00 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 67333.770000 | 0 |
3 | 8 | 2 | 37 | 0.0 | 195.0 | 195.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 195.0 | 195.0 | 0.0 | 0.0 | 195.0 | 0.0 | 0.0 | 195.0 | 0.0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 91.56 | 138.84 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.00 | 0.0 | 0.00 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 64007.970000 | 0 |
4 | 10 | 2 | 39 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 40501.08 | 13501.47 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 85501.89 | 85501.89 | 0.0 | 0.00 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 117310.979016 | 0 |
5 rows ร 371 columns
# Get X and y: the last column (TARGET) is the label.
# NOTE(review): iloc[:, :-1] keeps the ID column among the features; an ID is
# almost certainly uninformative and should probably be dropped — left in
# place here so the recorded 370-feature shapes stay valid.
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values
# Split data into training and testing
from sklearn import model_selection
# Reserve 20% for testing. Fix: a fixed random_state makes the split — and
# the timing comparisons below — reproducible.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=42)
print('Training data has %d observation with %d features' % X_train.shape)
print('Test data has %d observation with %d features' % X_test.shape)
Training data has 60816 observation with 370 features
Test data has 15204 observation with 370 features
3.1 CatBoost
Train on CPU
from catboost import CatBoostClassifier
# NOTE(review): this classifier is instantiated but never used below; the
# timed runs construct their own models.
classifier_cat = CatBoostClassifier(iterations = 100, task_type = 'GPU')
import timeit

def train_on_cpu():
    """Time-fit a 1000-iteration CatBoost model on CPU for the satisfaction dataset."""
    # Fix: indentation lost in the notebook export restored; stray ';' removed.
    model = CatBoostClassifier(
        iterations=1000,
        learning_rate=0.03,
        boosting_type='Plain',
    )
    model.fit(
        X_train, y_train,
        eval_set=(X_test, y_test),
        verbose=100,
    )

# number=1: one full training run is long enough to time directly.
cpu_time = timeit.timeit('train_on_cpu()',
                         setup="from __main__ import train_on_cpu",
                         number=1)
print('Time to fit model on CPU: {} sec'.format(int(cpu_time)))
0: learn: 0.6499884 test: 0.6502590 best: 0.6502590 (0) total: 97.6ms remaining: 1m 37s
100: learn: 0.1356915 test: 0.1439829 best: 0.1439829 (100) total: 11.9s remaining: 1m 46s
200: learn: 0.1312296 test: 0.1412094 best: 0.1412094 (200) total: 23.3s remaining: 1m 32s
300: learn: 0.1295643 test: 0.1405838 best: 0.1405772 (299) total: 34.2s remaining: 1m 19s
400: learn: 0.1280686 test: 0.1401968 best: 0.1401918 (399) total: 45.2s remaining: 1m 7s
500: learn: 0.1265318 test: 0.1399162 best: 0.1399123 (495) total: 56.4s remaining: 56.2s
600: learn: 0.1251861 test: 0.1397071 best: 0.1397071 (600) total: 1m 7s remaining: 44.8s
700: learn: 0.1239926 test: 0.1396878 best: 0.1396694 (695) total: 1m 18s remaining: 33.5s
800: learn: 0.1225489 test: 0.1395715 best: 0.1395640 (797) total: 1m 30s remaining: 22.4s
900: learn: 0.1212882 test: 0.1395614 best: 0.1395459 (852) total: 1m 41s remaining: 11.2s
999: learn: 0.1201106 test: 0.1395218 best: 0.1395129 (989) total: 1m 52s remaining: 0us
bestTest = 0.1395128933
bestIteration = 989
Shrink model to first 990 iterations.
Time to fit model on CPU: 117 sec
Train on GPU
def train_on_gpu():
    """GPU counterpart of train_on_cpu above."""
    # Fix: indentation lost in the notebook export restored; stray ';' removed.
    model = CatBoostClassifier(
        iterations=1000,
        learning_rate=0.03,
        # Fix: match the CPU run's boosting scheme so the speedup comparison
        # is apples-to-apples; the earlier sections set 'Plain' on both the
        # CPU and GPU runs, but this cell omitted it.
        boosting_type='Plain',
        task_type='GPU',
    )
    model.fit(
        X_train, y_train,
        eval_set=(X_test, y_test),
        verbose=100,
    )

gpu_time = timeit.timeit('train_on_gpu()',
                         setup="from __main__ import train_on_gpu",
                         number=1)
print('Time to fit model on GPU: {} sec'.format(int(gpu_time)))
# cpu_time must exist from the previous cell's run.
print('GPU speedup over CPU: ' + '%.2f' % (cpu_time/gpu_time) + 'x')
0: learn: 0.6420095 test: 0.6421342 best: 0.6421342 (0) total: 12.8ms remaining: 12.8s
100: learn: 0.1350072 test: 0.1434604 best: 0.1434604 (100) total: 1.01s remaining: 9.04s
200: learn: 0.1307862 test: 0.1409726 best: 0.1409726 (200) total: 1.88s remaining: 7.46s
300: learn: 0.1288554 test: 0.1404405 best: 0.1404258 (294) total: 2.75s remaining: 6.4s
400: learn: 0.1271075 test: 0.1399712 best: 0.1399712 (400) total: 3.63s remaining: 5.43s
500: learn: 0.1255655 test: 0.1397720 best: 0.1397680 (486) total: 4.51s remaining: 4.5s
600: learn: 0.1243287 test: 0.1396310 best: 0.1396310 (600) total: 5.57s remaining: 3.7s
700: learn: 0.1229532 test: 0.1396375 best: 0.1395841 (667) total: 6.65s remaining: 2.84s
800: learn: 0.1216558 test: 0.1396176 best: 0.1395823 (772) total: 7.75s remaining: 1.93s
900: learn: 0.1204271 test: 0.1396710 best: 0.1395823 (772) total: 8.86s remaining: 973ms
999: learn: 0.1191901 test: 0.1397230 best: 0.1395823 (772) total: 10s remaining: 0us
bestTest = 0.1395823121
bestIteration = 772
Shrink model to first 773 iterations.
Time to fit model on GPU: 15 sec
GPU speedup over CPU: 7.48x
3.2 LGBM
Train with LightGBM
import lightgbm as lgb
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
# specify your configurations as a dict
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'},
    'learning_rate': 0.03
}
# Use time function to measure time elapsed
import time
start = time.time()
# Fix: the original validated on lgb_train, leaving the held-out lgb_eval
# dataset unused; validate on the test split instead.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=1000,
                valid_sets=lgb_eval)
end = time.time()
print(end - start)