CatBoost Property Test
Yun Han 7/30/2019
1. Epsilon
!pip install catboost
# Download the Epsilon benchmark dataset (binary classification).
from catboost.datasets import epsilon
train, test = epsilon()
# Column 0 holds the label; the remaining columns are the dense features.
X_train, y_train = train.iloc[:,1:], train[0]
X_test, y_test = test.iloc[:,1:], test[0]
# Inspect the training matrix shape (recorded output: (400000, 2000)).
X_train.shape
(400000, 2000)
X_test.shape  # recorded output: (100000, 2000)
(100000, 2000)
!pip install -U -q PyDrive
[K |โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 993kB 9.6MB/s
[?25h Building wheel for PyDrive (setup.py) ... [?25l[?25hdone
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
# This only needs to be done once in a notebook.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
# Create & upload a text file.
# Title the file should get on Google Drive.
uploaded = drive.CreateFile({'title': 'OK.csv'})
# Change to the name of the local file generated earlier.
uploaded.SetContentFile('over.csv')
uploaded.Upload()
print('Uploaded file with ID {}'.format(uploaded.get('id')))
Uploaded file with ID 1DPAcA9VzfP6HuhhSt0ifk9vXSsdCgobH
X_train.shape  # sanity re-check after the upload cell; still (400000, 2000)
(400000, 2000)
Training on CPU
from catboost import CatBoostClassifier
import timeit

def train_on_cpu():
    """Fit a 1000-iteration CatBoost classifier on the CPU (default task_type).

    Uses the module-level X_train/y_train and evaluates on (X_test, y_test)
    every 10 iterations. Returns nothing; only the wall-clock time matters here.
    """
    # Fix: the exported cell had lost its indentation (the body was at
    # column 0, which is a SyntaxError); structure restored, stray ';' removed.
    model = CatBoostClassifier(
        iterations=1000,
        learning_rate=0.03,
        boosting_type='Plain',
    )
    model.fit(
        X_train, y_train,
        eval_set=(X_test, y_test),
        verbose=10,
    )

# number=1: a single full training run is long enough to time directly.
cpu_time = timeit.timeit('train_on_cpu()',
                         setup="from __main__ import train_on_cpu",
                         number=1)
print('Time to fit model on CPU: {} sec'.format(int(cpu_time)))
Training on GPU
from catboost import CatBoostClassifier
import timeit

def train_on_gpu():
    """Fit the same 1000-iteration CatBoost configuration, but on the GPU."""
    # Fix: indentation lost in the notebook export restored; stray ';' removed.
    model = CatBoostClassifier(
        iterations=1000,
        learning_rate=0.03,
        boosting_type='Plain',
        task_type='GPU',
    )
    model.fit(
        X_train, y_train,
        eval_set=(X_test, y_test),
        verbose=10,
    )

gpu_time = timeit.timeit('train_on_gpu()',
                         setup="from __main__ import train_on_gpu",
                         number=1)
print('Time to fit model on GPU: {} sec'.format(int(gpu_time)))
# cpu_time comes from the previous cell; both cells must have been run.
print('GPU speedup over CPU: ' + '%.2f' % (cpu_time/gpu_time) + 'x')
from catboost import CatBoostClassifier
# Quick baseline: 100 GPU-accelerated iterations with default hyperparameters.
classifier_cat = CatBoostClassifier(iterations = 100, task_type = 'GPU')
# Train the model
classifier_cat.fit(X_train, y_train)
# Prediction of test data (bare expression: the notebook displays the array)
classifier_cat.predict(X_test)
# Accuracy of test data (bare expression: the notebook displays the score)
classifier_cat.score(X_test, y_test)
import lightgbm as lgb
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
# specify your configurations as a dict
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'},
    'learning_rate': 0.03
}
# Use time function to measure time elapsed
import time
start = time.time()
# Fix: the original passed valid_sets=lgb_train, so the held-out lgb_eval
# dataset was created but never used and "validation" ran on the training
# split; validate on the test split instead.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=1000,
                valid_sets=lgb_eval)
end = time.time()
print(end - start)
2. Credit card lost
from google.colab import files
# Interactively upload the credit-card CSV from the local machine;
# `uploaded` maps filename -> file bytes.
uploaded = files.upload()
<input type="file" id="files-e6110ee2-0f38-4084-a083-baf51740d777" name="files[]" multiple disabled />
<output id="result-e6110ee2-0f38-4084-a083-baf51740d777">
Upload widget is only available when the cell has been executed in the
current browser session. Please rerun this cell to enable.
</output>
<script src="/nbextensions/google.colab/files.js"></script>
Saving creditcard lost.csv to creditcard lost.csv
import io
import pandas as pd
# Parse the uploaded bytes into a DataFrame; the key must match the
# filename chosen in the upload widget ('creditcard lost.csv').
df = pd.read_csv(io.BytesIO(uploaded['creditcard lost.csv']))
df.head()
y | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | x13 | x14 | x15 | x16 | x17 | x18 | x19 | x20 | x21 | x22 | x23 | x24 | x25 | x26 | x27 | x28 | x29 | x30 | x31 | x32 | x33 | x34 | x35 | x36 | x37 | x38 | x39 | ... | x81 | x82 | x83 | x84 | x85 | x86 | x87 | x88 | x89 | x90 | x91 | x92 | x93 | x94 | x95 | x96 | x97 | x98 | x99 | x100 | x101 | x102 | x103 | x104 | x105 | x106 | x107 | x108 | x109 | x110 | x111 | x112 | x113 | x114 | x115 | x116 | x117 | x118 | x119 | x120 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 5.04 | 5.77 | 6.04 | 3.91 | 0.66 | 1.04 | 1.83 | 5.41 | 5.45 | 6.91 | 6.70 | 1.17 | 3.08 | 3.44 | 1.83 | 5.32 | 5.16 | 4.62 | 4.58 | 3.98 | 6.57 | 5.18 | 1.79 | 2.99 | 7.95 | 21.02 | 1.70 | 0.00 | 4.55 | 6.90 | 0.00 | 0.00 | 0.00 | 2.25 | 3.16 | 5.85 | 6.26 | 5.06 | 0.00 | ... | 4.94 | 5.75 | 6.23 | 5.80 | 1.45 | 2.63 | 2.19 | 5.18 | 8.17 | 3.39 | 1.14 | 7.95 | 7.95 | 8.52 | 3.41 | 3.98 | 0.0 | 3.45 | 0.00 | 0.0 | 3.45 | 0.00 | 0.0 | 0.00 | 20.69 | 7.94 | 7.70 | 2.97 | 6.88 | 4.54 | 0.68 | 7.31 | 8.04 | 10.73 | 5.58 | 0.00 | 7.52 | 5.37 | 6.18 | 4.24 |
1 | 1 | 5.98 | 2.90 | 1.94 | 1.42 | 0.10 | 3.29 | 0.81 | 9.09 | 8.04 | 4.65 | 2.65 | 0.46 | 2.25 | 6.63 | 0.00 | 6.19 | 5.06 | 0.56 | 4.80 | 6.55 | 3.49 | 0.44 | 0.00 | 3.93 | 10.62 | 4.28 | 3.83 | 3.98 | 7.52 | 24.07 | 9.26 | 0.00 | 0.00 | 0.47 | 0.04 | 10.31 | 9.35 | 4.45 | 0.04 | ... | 5.17 | 7.54 | 13.27 | 3.82 | 0.00 | 0.34 | 0.00 | 10.48 | 6.55 | 0.44 | 0.00 | 12.83 | 12.83 | 5.31 | 2.51 | 1.33 | 0.0 | 4.63 | 1.85 | 0.0 | 2.78 | 2.78 | 0.0 | 0.00 | 0.93 | 12.21 | 6.96 | 0.10 | 4.20 | 0.88 | 0.19 | 10.44 | 7.17 | 2.96 | 6.75 | 0.17 | 5.40 | 8.32 | 5.40 | 1.57 |
2 | 1 | 7.04 | 8.92 | 3.24 | 0.01 | 0.00 | 4.50 | 0.00 | 8.41 | 6.97 | 12.65 | 9.24 | 0.00 | 0.54 | 10.11 | 0.06 | 10.43 | 12.13 | 1.42 | 13.91 | 10.66 | 3.73 | 3.05 | 0.00 | 2.61 | 2.86 | 8.57 | 10.00 | 0.00 | 8.57 | 11.11 | 3.70 | 0.00 | 3.70 | 0.00 | 0.00 | 9.51 | 5.19 | 6.20 | 0.00 | ... | 7.11 | 8.28 | 8.66 | 5.33 | 0.00 | 0.13 | 0.00 | 6.01 | 6.16 | 0.08 | 0.00 | 10.00 | 7.14 | 11.43 | 20.00 | 2.86 | 0.0 | 7.41 | 14.81 | 0.0 | 0.00 | 0.00 | 0.0 | 0.00 | 11.11 | 8.47 | 9.45 | 0.00 | 4.60 | 0.43 | 0.00 | 3.77 | 2.70 | 6.84 | 7.28 | 0.00 | 5.60 | 7.52 | 5.93 | 3.89 |
3 | 1 | 13.30 | 7.16 | 5.69 | 1.98 | 0.71 | 2.20 | 0.22 | 13.89 | 1.75 | 0.00 | 3.27 | 0.00 | 10.26 | 9.46 | 4.42 | 4.14 | 4.08 | 4.32 | 4.22 | 4.11 | 3.83 | 4.08 | 4.36 | 3.99 | 7.96 | 6.19 | 5.31 | 4.42 | 11.50 | 32.14 | 3.57 | 0.00 | 0.00 | 0.58 | 0.13 | 5.83 | 1.45 | 11.60 | 0.00 | ... | 4.11 | 4.15 | 4.13 | 4.20 | 4.37 | 4.30 | 4.54 | 4.15 | 4.11 | 4.28 | 0.00 | 9.73 | 4.42 | 3.54 | 7.96 | 4.42 | 0.0 | 7.14 | 3.57 | 0.0 | 0.00 | 0.00 | 0.0 | 0.00 | 3.57 | 2.95 | 7.28 | 0.00 | 9.54 | 5.71 | 0.48 | 4.77 | 1.13 | 2.67 | 8.83 | 0.00 | 4.09 | 4.14 | 4.16 | 4.23 |
4 | 1 | 5.24 | 5.09 | 5.31 | 1.05 | 0.00 | 4.24 | 0.34 | 5.25 | 5.59 | 9.58 | 3.32 | 0.00 | 5.85 | 10.89 | 2.16 | 6.13 | 5.88 | 4.22 | 5.15 | 5.54 | 5.85 | 4.68 | 1.15 | 1.54 | 9.42 | 7.48 | 2.77 | 3.88 | 6.65 | 13.64 | 3.03 | 4.55 | 4.55 | 0.24 | 0.08 | 8.67 | 8.11 | 4.08 | 0.00 | ... | 5.93 | 6.01 | 6.05 | 6.07 | 1.64 | 3.17 | 2.25 | 6.44 | 6.60 | 2.91 | 0.28 | 9.14 | 9.42 | 4.99 | 3.60 | 2.49 | 0.0 | 15.15 | 1.52 | 0.0 | 0.00 | 0.00 | 0.0 | 4.55 | 9.09 | 7.24 | 7.63 | 0.00 | 5.11 | 3.71 | 0.00 | 6.01 | 4.16 | 4.49 | 6.14 | 0.00 | 6.18 | 6.10 | 6.06 | 5.86 |
5 rows ร 121 columns
X  # NOTE(review): X is defined only in the next cell — cells were run out of order
array([[5.04, 5.77, 6.04, ..., 5.37, 6.18, 4.24],
[5.98, 2.9 , 1.94, ..., 8.32, 5.4 , 1.57],
[7.04, 8.92, 3.24, ..., 7.52, 5.93, 3.89],
...,
[2.74, 5.42, 8.79, ..., 6.75, 4.98, 5.21],
[7.51, 6.76, 4.73, ..., 4.94, 5. , 4.93],
[3.81, 6.79, 7.66, ..., 5.17, 5.06, 4.72]])
# Get X and y: column 0 is the label y, columns x1..x120 are the features.
X = df.iloc[:, 1:].values
y = df.iloc[:, 0].values
# Split data into training and testing
from sklearn import model_selection
# Reserve 20% for testing. Fix: a fixed random_state makes the split — and
# therefore the CPU/GPU/LightGBM timing comparisons below — reproducible.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=42)
print('Training data has %d observation with %d features' % X_train.shape)
print('Test data has %d observation with %d features' % X_test.shape)
Training data has 12000 observation with 120 features
Test data has 3000 observation with 120 features
!pip install catboost
from catboost import CatBoostClassifier
# NOTE(review): this 100-iteration GPU classifier is instantiated but never
# fitted in this section; the timed runs below build their own models.
classifier_cat = CatBoostClassifier(iterations = 100, task_type = 'GPU')
X_train.shape
(12000, 120)
import timeit

def train_on_cpu():
    """Time-fit a 1000-iteration CatBoost model on CPU, tracking AUC on the test split."""
    # Fix: indentation lost in the notebook export restored; stray ';' removed.
    model = CatBoostClassifier(
        iterations=1000,
        eval_metric='AUC',
        boosting_type='Plain',
        learning_rate=0.03,
    )
    model.fit(
        X_train, y_train,
        eval_set=(X_test, y_test),
        verbose=100,
    )

# number=1: one full training run is long enough to time directly.
cpu_time = timeit.timeit('train_on_cpu()',
                         setup="from __main__ import train_on_cpu",
                         number=1)
print('Time to fit model on CPU: {} sec'.format(int(cpu_time)))
0: learn: 0.6676061 test: 0.6675290 best: 0.6675290 (0) total: 38ms remaining: 37.9s
100: learn: 0.3157643 test: 0.3226858 best: 0.3226858 (100) total: 4.22s remaining: 37.6s
200: learn: 0.3003360 test: 0.3215573 best: 0.3214868 (186) total: 8.91s remaining: 35.4s
300: learn: 0.2866442 test: 0.3216649 best: 0.3214678 (221) total: 13.5s remaining: 31.5s
400: learn: 0.2729592 test: 0.3216150 best: 0.3214438 (359) total: 18.2s remaining: 27.1s
500: learn: 0.2592651 test: 0.3218496 best: 0.3214438 (359) total: 22.8s remaining: 22.7s
600: learn: 0.2460502 test: 0.3213318 best: 0.3212648 (585) total: 27.4s remaining: 18.2s
700: learn: 0.2329541 test: 0.3218720 best: 0.3212648 (585) total: 32.1s remaining: 13.7s
800: learn: 0.2210274 test: 0.3218479 best: 0.3212648 (585) total: 36.8s remaining: 9.15s
900: learn: 0.2093181 test: 0.3221360 best: 0.3212648 (585) total: 41.6s remaining: 4.57s
999: learn: 0.1988029 test: 0.3234133 best: 0.3212648 (585) total: 46.3s remaining: 0us
bestTest = 0.3212648222
bestIteration = 585
Shrink model to first 586 iterations.
Time to fit model on CPU: 46 sec
def train_on_gpu():
    """GPU counterpart of train_on_cpu above: identical configuration plus task_type='GPU'."""
    # Fix: indentation lost in the notebook export restored; stray ';' removed.
    model = CatBoostClassifier(
        iterations=1000,
        learning_rate=0.03,
        eval_metric='AUC',
        boosting_type='Plain',
        task_type='GPU',
    )
    model.fit(
        X_train, y_train,
        eval_set=(X_test, y_test),
        verbose=100,
    )

gpu_time = timeit.timeit('train_on_gpu()',
                         setup="from __main__ import train_on_gpu",
                         number=1)
print('Time to fit model on GPU: {} sec'.format(int(gpu_time)))
# cpu_time must exist from the previous cell's run.
print('GPU speedup over CPU: ' + '%.2f' % (cpu_time/gpu_time) + 'x')
0: learn: 0.5710810 test: 0.5505786 best: 0.5505786 (0) total: 22.9ms remaining: 22.9s
100: learn: 0.8201020 test: 0.5631405 best: 0.5827100 (49) total: 2.35s remaining: 20.9s
200: learn: 0.8866887 test: 0.5592176 best: 0.5827100 (49) total: 4.79s remaining: 19.1s
300: learn: 0.9317816 test: 0.5518306 best: 0.5827100 (49) total: 7.2s remaining: 16.7s
400: learn: 0.9584097 test: 0.5486118 best: 0.5827100 (49) total: 9.55s remaining: 14.3s
500: learn: 0.9745375 test: 0.5483546 best: 0.5827100 (49) total: 12s remaining: 12s
600: learn: 0.9842969 test: 0.5495813 best: 0.5827100 (49) total: 14.5s remaining: 9.62s
700: learn: 0.9908361 test: 0.5486887 best: 0.5827100 (49) total: 16.8s remaining: 7.17s
800: learn: 0.9940755 test: 0.5510603 best: 0.5827100 (49) total: 19.2s remaining: 4.76s
900: learn: 0.9961302 test: 0.5465447 best: 0.5827100 (49) total: 21.5s remaining: 2.36s
999: learn: 0.9976045 test: 0.5430479 best: 0.5827100 (49) total: 23.9s remaining: 0us
bestTest = 0.5827099681
bestIteration = 49
Shrink model to first 50 iterations.
Time to fit model on GPU: 26 sec
GPU speedup over CPU: 1.79x
import lightgbm as lgb
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
# specify your configurations as a dict
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'},
    'learning_rate': 0.03
}
# Use time function to measure time elapsed
import time
start = time.time()
# Fix: the original validated on lgb_train, leaving the held-out lgb_eval
# dataset unused; validate on the test split instead.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=1000,
                valid_sets=lgb_eval)
end = time.time()
print(end - start)
3. Customer Satisfaction
from google.colab import files
# Interactively upload 'customer_satisfaction.csv' from the local machine.
uploaded = files.upload()
<input type="file" id="files-e5655230-bf1b-40c8-8ab5-975d6b884627" name="files[]" multiple disabled />
<output id="result-e5655230-bf1b-40c8-8ab5-975d6b884627">
Upload widget is only available when the cell has been executed in the
current browser session. Please rerun this cell to enable.
</output>
<script src="/nbextensions/google.colab/files.js"></script>
Saving customer_satisfaction.csv to customer_satisfaction.csv
import io
import pandas as pd
# Parse the uploaded bytes into a DataFrame; the key must match the uploaded filename.
data = pd.read_csv(io.BytesIO(uploaded['customer_satisfaction.csv']))
data.head()
ID | var3 | var15 | imp_ent_var16_ult1 | imp_op_var39_comer_ult1 | imp_op_var39_comer_ult3 | imp_op_var40_comer_ult1 | imp_op_var40_comer_ult3 | imp_op_var40_efect_ult1 | imp_op_var40_efect_ult3 | imp_op_var40_ult1 | imp_op_var41_comer_ult1 | imp_op_var41_comer_ult3 | imp_op_var41_efect_ult1 | imp_op_var41_efect_ult3 | imp_op_var41_ult1 | imp_op_var39_efect_ult1 | imp_op_var39_efect_ult3 | imp_op_var39_ult1 | imp_sal_var16_ult1 | ind_var1_0 | ind_var1 | ind_var2_0 | ind_var2 | ind_var5_0 | ind_var5 | ind_var6_0 | ind_var6 | ind_var8_0 | ind_var8 | ind_var12_0 | ind_var12 | ind_var13_0 | ind_var13_corto_0 | ind_var13_corto | ind_var13_largo_0 | ind_var13_largo | ind_var13_medio_0 | ind_var13_medio | ind_var13 | ... | saldo_medio_var5_ult1 | saldo_medio_var5_ult3 | saldo_medio_var8_hace2 | saldo_medio_var8_hace3 | saldo_medio_var8_ult1 | saldo_medio_var8_ult3 | saldo_medio_var12_hace2 | saldo_medio_var12_hace3 | saldo_medio_var12_ult1 | saldo_medio_var12_ult3 | saldo_medio_var13_corto_hace2 | saldo_medio_var13_corto_hace3 | saldo_medio_var13_corto_ult1 | saldo_medio_var13_corto_ult3 | saldo_medio_var13_largo_hace2 | saldo_medio_var13_largo_hace3 | saldo_medio_var13_largo_ult1 | saldo_medio_var13_largo_ult3 | saldo_medio_var13_medio_hace2 | saldo_medio_var13_medio_hace3 | saldo_medio_var13_medio_ult1 | saldo_medio_var13_medio_ult3 | saldo_medio_var17_hace2 | saldo_medio_var17_hace3 | saldo_medio_var17_ult1 | saldo_medio_var17_ult3 | saldo_medio_var29_hace2 | saldo_medio_var29_hace3 | saldo_medio_var29_ult1 | saldo_medio_var29_ult3 | saldo_medio_var33_hace2 | saldo_medio_var33_hace3 | saldo_medio_var33_ult1 | saldo_medio_var33_ult3 | saldo_medio_var44_hace2 | saldo_medio_var44_hace3 | saldo_medio_var44_ult1 | saldo_medio_var44_ult3 | var38 | TARGET | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 2 | 23 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0.00 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.00 | 0.0 | 0.00 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 39205.170000 | 0 |
1 | 3 | 2 | 34 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | ... | 0.00 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.00 | 300.0 | 122.22 | 300.0 | 240.75 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 49278.030000 | 0 |
2 | 4 | 2 | 23 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 3.00 | 2.07 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.00 | 0.0 | 0.00 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 67333.770000 | 0 |
3 | 8 | 2 | 37 | 0.0 | 195.0 | 195.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 195.0 | 195.0 | 0.0 | 0.0 | 195.0 | 0.0 | 0.0 | 195.0 | 0.0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 91.56 | 138.84 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.00 | 0.0 | 0.00 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 64007.970000 | 0 |
4 | 10 | 2 | 39 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 40501.08 | 13501.47 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 85501.89 | 85501.89 | 0.0 | 0.00 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 117310.979016 | 0 |
5 rows ร 371 columns
# Get X and y: the last column (TARGET) is the label.
# NOTE(review): iloc[:, :-1] keeps the ID column among the features; an ID is
# almost certainly uninformative and should probably be dropped — left in
# place here so the recorded 370-feature shapes stay valid.
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values
# Split data into training and testing
from sklearn import model_selection
# Reserve 20% for testing. Fix: a fixed random_state makes the split — and
# the timing comparisons below — reproducible.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=42)
print('Training data has %d observation with %d features' % X_train.shape)
print('Test data has %d observation with %d features' % X_test.shape)
Training data has 60816 observation with 370 features
Test data has 15204 observation with 370 features
3.1 CatBoost
Train on CPU
from catboost import CatBoostClassifier
# NOTE(review): this classifier is instantiated but never used below; the
# timed runs construct their own models.
classifier_cat = CatBoostClassifier(iterations = 100, task_type = 'GPU')
import timeit

def train_on_cpu():
    """Time-fit a 1000-iteration CatBoost model on CPU for the satisfaction dataset."""
    # Fix: indentation lost in the notebook export restored; stray ';' removed.
    model = CatBoostClassifier(
        iterations=1000,
        learning_rate=0.03,
        boosting_type='Plain',
    )
    model.fit(
        X_train, y_train,
        eval_set=(X_test, y_test),
        verbose=100,
    )

# number=1: one full training run is long enough to time directly.
cpu_time = timeit.timeit('train_on_cpu()',
                         setup="from __main__ import train_on_cpu",
                         number=1)
print('Time to fit model on CPU: {} sec'.format(int(cpu_time)))
0: learn: 0.6499884 test: 0.6502590 best: 0.6502590 (0) total: 97.6ms remaining: 1m 37s
100: learn: 0.1356915 test: 0.1439829 best: 0.1439829 (100) total: 11.9s remaining: 1m 46s
200: learn: 0.1312296 test: 0.1412094 best: 0.1412094 (200) total: 23.3s remaining: 1m 32s
300: learn: 0.1295643 test: 0.1405838 best: 0.1405772 (299) total: 34.2s remaining: 1m 19s
400: learn: 0.1280686 test: 0.1401968 best: 0.1401918 (399) total: 45.2s remaining: 1m 7s
500: learn: 0.1265318 test: 0.1399162 best: 0.1399123 (495) total: 56.4s remaining: 56.2s
600: learn: 0.1251861 test: 0.1397071 best: 0.1397071 (600) total: 1m 7s remaining: 44.8s
700: learn: 0.1239926 test: 0.1396878 best: 0.1396694 (695) total: 1m 18s remaining: 33.5s
800: learn: 0.1225489 test: 0.1395715 best: 0.1395640 (797) total: 1m 30s remaining: 22.4s
900: learn: 0.1212882 test: 0.1395614 best: 0.1395459 (852) total: 1m 41s remaining: 11.2s
999: learn: 0.1201106 test: 0.1395218 best: 0.1395129 (989) total: 1m 52s remaining: 0us
bestTest = 0.1395128933
bestIteration = 989
Shrink model to first 990 iterations.
Time to fit model on CPU: 117 sec
Train on GPU
def train_on_gpu():
    """GPU counterpart of train_on_cpu above."""
    # Fix: indentation lost in the notebook export restored; stray ';' removed.
    model = CatBoostClassifier(
        iterations=1000,
        learning_rate=0.03,
        # Fix: match the CPU run's boosting scheme so the speedup comparison
        # is apples-to-apples; the earlier sections set 'Plain' on both the
        # CPU and GPU runs, but this cell omitted it.
        boosting_type='Plain',
        task_type='GPU',
    )
    model.fit(
        X_train, y_train,
        eval_set=(X_test, y_test),
        verbose=100,
    )

gpu_time = timeit.timeit('train_on_gpu()',
                         setup="from __main__ import train_on_gpu",
                         number=1)
print('Time to fit model on GPU: {} sec'.format(int(gpu_time)))
# cpu_time must exist from the previous cell's run.
print('GPU speedup over CPU: ' + '%.2f' % (cpu_time/gpu_time) + 'x')
0: learn: 0.6420095 test: 0.6421342 best: 0.6421342 (0) total: 12.8ms remaining: 12.8s
100: learn: 0.1350072 test: 0.1434604 best: 0.1434604 (100) total: 1.01s remaining: 9.04s
200: learn: 0.1307862 test: 0.1409726 best: 0.1409726 (200) total: 1.88s remaining: 7.46s
300: learn: 0.1288554 test: 0.1404405 best: 0.1404258 (294) total: 2.75s remaining: 6.4s
400: learn: 0.1271075 test: 0.1399712 best: 0.1399712 (400) total: 3.63s remaining: 5.43s
500: learn: 0.1255655 test: 0.1397720 best: 0.1397680 (486) total: 4.51s remaining: 4.5s
600: learn: 0.1243287 test: 0.1396310 best: 0.1396310 (600) total: 5.57s remaining: 3.7s
700: learn: 0.1229532 test: 0.1396375 best: 0.1395841 (667) total: 6.65s remaining: 2.84s
800: learn: 0.1216558 test: 0.1396176 best: 0.1395823 (772) total: 7.75s remaining: 1.93s
900: learn: 0.1204271 test: 0.1396710 best: 0.1395823 (772) total: 8.86s remaining: 973ms
999: learn: 0.1191901 test: 0.1397230 best: 0.1395823 (772) total: 10s remaining: 0us
bestTest = 0.1395823121
bestIteration = 772
Shrink model to first 773 iterations.
Time to fit model on GPU: 15 sec
GPU speedup over CPU: 7.48x
3.2 LGBM
Train with LightGBM
import lightgbm as lgb
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
# specify your configurations as a dict
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'},
    'learning_rate': 0.03
}
# Use time function to measure time elapsed
import time
start = time.time()
# Fix: the original validated on lgb_train, leaving the held-out lgb_eval
# dataset unused; validate on the test split instead.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=1000,
                valid_sets=lgb_eval)
end = time.time()
print(end - start)