import pandas as pd
import numpy as np
import matplotlib. pyplot as plt
import seaborn as sns
from time import time
import datetime
from sklearn. model_selection import train_test_split, cross_val_score
from sklearn. feature_selection import f_classif, SelectFromModel, VarianceThreshold
from sklearn. ensemble import RandomForestClassifier as RFC
% matplotlib inline
# Plot configuration: use the SimHei font (presumably so the Chinese labels
# used further down render) and draw minus signs with the ASCII hyphen.
plt.rcParams.update({
    'font.family': ['SimHei'],
    'axes.unicode_minus': False,
})

# Load the raw sample file and preview its first rows.
df = pd.read_table('userlostprob.txt')
df.head()
label sampleid d arrival iforderpv_24h decisionhabit_user historyvisit_7ordernum historyvisit_totalordernum hotelcr ordercanceledprecent ... lowestprice_pre2 lasthtlordergap businessrate_pre2 cityuvs cityorders lastpvgap cr sid visitnum_oneyear h 0 0 24636 2016-05-18 2016-05-18 0 NaN NaN NaN 1.04 NaN ... 615.0 NaN 0.29 12.880 3.147 NaN NaN 7 NaN 12 1 1 24637 2016-05-18 2016-05-18 0 NaN NaN NaN 1.06 NaN ... 513.0 NaN 0.53 17.933 4.913 NaN NaN 33 NaN 14 2 0 24641 2016-05-18 2016-05-19 0 NaN NaN NaN 1.05 NaN ... 382.0 NaN 0.60 3.993 0.760 NaN NaN 10 NaN 19 3 0 24642 2016-05-18 2016-05-18 0 NaN NaN NaN 1.01 NaN ... 203.0 NaN 0.18 3.220 0.660 NaN NaN 8 NaN 16 4 1 24644 2016-05-18 2016-05-19 0 NaN NaN NaN 1.00 NaN ... 84.0 NaN NaN 0.013 NaN NaN NaN 1 NaN 21
5 rows × 51 columns
df[ 'label' ] . value_counts( )
0 500588
1 189357
Name: label, dtype: int64
df. tail( )
label sampleid d arrival iforderpv_24h decisionhabit_user historyvisit_7ordernum historyvisit_totalordernum hotelcr ordercanceledprecent ... lowestprice_pre2 lasthtlordergap businessrate_pre2 cityuvs cityorders lastpvgap cr sid visitnum_oneyear h 689940 1 2238419 2016-05-15 2016-05-17 1 19.0 NaN NaN 1.06 NaN ... 406.0 NaN 0.48 13.573 1.660 1034.0 1.0 5 119.0 18 689941 1 2238421 2016-05-15 2016-05-15 1 10.0 3.0 3.0 1.06 0.33 ... 199.0 713.0 0.51 2.880 0.513 179.0 2.0 15 1472.0 12 689942 0 2238422 2016-05-15 2016-05-17 0 NaN NaN NaN 1.07 NaN ... 544.0 NaN 0.45 15.293 2.067 0.0 NaN 8 107.0 0 689943 0 2238425 2016-05-15 2016-05-17 0 NaN NaN NaN 1.04 NaN ... 156.0 NaN 0.29 2.467 0.333 NaN NaN 4 NaN 0 689944 0 2238426 2016-05-15 2016-05-15 0 NaN NaN NaN 1.02 NaN ... 275.0 NaN NaN 12.600 2.653 NaN NaN 2 NaN 11
5 rows × 51 columns
df. sample( 5 )
label sampleid d arrival iforderpv_24h decisionhabit_user historyvisit_7ordernum historyvisit_totalordernum hotelcr ordercanceledprecent ... lowestprice_pre2 lasthtlordergap businessrate_pre2 cityuvs cityorders lastpvgap cr sid visitnum_oneyear h 477013 1 820235 2016-05-21 2016-05-21 0 15.0 NaN 15.0 1.05 0.36 ... 582.0 18831.0 0.48 17.220 3.400 4242.0 1.33 446 906.0 9 426926 0 736598 2016-05-15 2016-05-15 0 1.0 NaN 39.0 1.05 0.16 ... 978.0 12199.0 0.13 5.113 0.847 642.0 1.36 732 2583.0 8 628554 0 1072402 2016-05-20 2016-05-20 0 NaN NaN 3.0 1.02 0.00 ... 147.0 55214.0 0.27 15.873 3.220 10002.0 1.11 186 905.0 19 248275 0 438633 2016-05-18 2016-06-09 0 19.0 2.0 28.0 1.02 0.78 ... NaN 3329.0 NaN 1.320 0.087 145.0 1.12 449 17397.0 11 198972 0 356550 2016-05-19 2016-05-19 0 7.0 NaN 2.0 1.04 0.50 ... 206.0 61467.0 0.32 20.480 5.153 13264.0 1.08 59 1522.0 20
5 rows × 51 columns
df. shape
(689945, 51)
df. dtypes
label int64
sampleid int64
d object
arrival object
iforderpv_24h int64
decisionhabit_user float64
historyvisit_7ordernum float64
historyvisit_totalordernum float64
hotelcr float64
ordercanceledprecent float64
landhalfhours float64
ordercanncelednum float64
commentnums float64
starprefer float64
novoters float64
consuming_capacity float64
historyvisit_avghotelnum float64
cancelrate float64
historyvisit_visit_detailpagenum float64
delta_price1 float64
price_sensitive float64
hoteluv float64
businessrate_pre float64
ordernum_oneyear float64
cr_pre float64
avgprice float64
lowestprice float64
firstorder_bu float64
customereval_pre2 float64
delta_price2 float64
commentnums_pre float64
customer_value_profit float64
commentnums_pre2 float64
cancelrate_pre float64
novoters_pre2 float64
novoters_pre float64
ctrip_profits float64
deltaprice_pre2_t1 float64
lowestprice_pre float64
uv_pre float64
uv_pre2 float64
lowestprice_pre2 float64
lasthtlordergap float64
businessrate_pre2 float64
cityuvs float64
cityorders float64
lastpvgap float64
cr float64
sid int64
visitnum_oneyear float64
h int64
dtype: object
df. info( )
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 689945 entries, 0 to 689944
Data columns (total 51 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 label 689945 non-null int64
1 sampleid 689945 non-null int64
2 d 689945 non-null object
3 arrival 689945 non-null object
4 iforderpv_24h 689945 non-null int64
5 decisionhabit_user 385450 non-null float64
6 historyvisit_7ordernum 82915 non-null float64
7 historyvisit_totalordernum 386525 non-null float64
8 hotelcr 689148 non-null float64
9 ordercanceledprecent 447831 non-null float64
10 landhalfhours 661312 non-null float64
11 ordercanncelednum 447831 non-null float64
12 commentnums 622029 non-null float64
13 starprefer 464892 non-null float64
14 novoters 672918 non-null float64
15 consuming_capacity 463837 non-null float64
16 historyvisit_avghotelnum 387876 non-null float64
17 cancelrate 678227 non-null float64
18 historyvisit_visit_detailpagenum 307234 non-null float64
19 delta_price1 437146 non-null float64
20 price_sensitive 463837 non-null float64
21 hoteluv 689148 non-null float64
22 businessrate_pre 483896 non-null float64
23 ordernum_oneyear 447831 non-null float64
24 cr_pre 660548 non-null float64
25 avgprice 457261 non-null float64
26 lowestprice 687931 non-null float64
27 firstorder_bu 376993 non-null float64
28 customereval_pre2 661312 non-null float64
29 delta_price2 437750 non-null float64
30 commentnums_pre 598368 non-null float64
31 customer_value_profit 439123 non-null float64
32 commentnums_pre2 648457 non-null float64
33 cancelrate_pre 653015 non-null float64
34 novoters_pre2 657616 non-null float64
35 novoters_pre 648956 non-null float64
36 ctrip_profits 445187 non-null float64
37 deltaprice_pre2_t1 543180 non-null float64
38 lowestprice_pre 659689 non-null float64
39 uv_pre 660548 non-null float64
40 uv_pre2 661189 non-null float64
41 lowestprice_pre2 660664 non-null float64
42 lasthtlordergap 447831 non-null float64
43 businessrate_pre2 602960 non-null float64
44 cityuvs 682274 non-null float64
45 cityorders 651263 non-null float64
46 lastpvgap 592818 non-null float64
47 cr 457896 non-null float64
48 sid 689945 non-null int64
49 visitnum_oneyear 592910 non-null float64
50 h 689945 non-null int64
dtypes: float64(44), int64(5), object(2)
memory usage: 268.5+ MB
df. describe( [ 0.01 , 0.1 , 0.25 , 0.5 , 0.75 , 0.9 , 0.99 ] )
label sampleid iforderpv_24h decisionhabit_user historyvisit_7ordernum historyvisit_totalordernum hotelcr ordercanceledprecent landhalfhours ordercanncelednum ... lowestprice_pre2 lasthtlordergap businessrate_pre2 cityuvs cityorders lastpvgap cr sid visitnum_oneyear h count 689945.000000 6.899450e+05 689945.000000 385450.000000 82915.000000 386525.000000 689148.000000 447831.000000 661312.000000 447831.000000 ... 660664.000000 447831.000000 602960.000000 682274.000000 651263.000000 592818.000000 457896.000000 689945.000000 5.929100e+05 689945.000000 mean 0.274452 6.285402e+05 0.193737 5.317048 1.856094 11.710487 1.060996 0.342119 6.086366 154.179369 ... 318.541812 101830.919400 0.368237 10.648278 2.253250 12049.409382 1.137476 153.702414 1.855185e+04 14.462315 std 0.446238 4.146815e+05 0.395226 38.524483 2.103862 17.251429 0.045264 0.354210 12.413225 398.456986 ... 351.913035 122784.313864 0.219945 15.696682 3.538453 25601.374138 0.204789 277.807697 2.288603e+05 6.301575 min 0.000000 2.463600e+04 0.000000 0.000000 1.000000 1.000000 1.000000 0.000000 0.000000 0.000000 ... 1.000000 0.000000 0.000000 0.007000 0.007000 0.000000 1.000000 0.000000 1.000000e+00 0.000000 1% 0.000000 3.620588e+04 0.000000 1.000000 1.000000 1.000000 1.000000 0.000000 0.000000 0.000000 ... 52.000000 244.000000 0.010000 0.013000 0.007000 0.000000 1.000000 1.000000 2.100000e+01 0.000000 10% 0.000000 1.398464e+05 0.000000 1.000000 1.000000 1.000000 1.010000 0.000000 0.000000 0.000000 ... 101.000000 3518.000000 0.050000 0.160000 0.033000 127.000000 1.000000 4.000000 1.610000e+02 6.000000 25% 0.000000 3.123200e+05 0.000000 2.000000 1.000000 2.000000 1.030000 0.000000 0.000000 0.000000 ... 145.000000 14999.000000 0.170000 0.827000 0.127000 551.000000 1.000000 17.000000 4.710000e+02 11.000000 50% 0.000000 5.996370e+05 0.000000 3.000000 1.000000 6.000000 1.050000 0.250000 0.000000 2.000000 ... 
233.000000 46890.000000 0.400000 3.527000 0.627000 2848.000000 1.050000 62.000000 1.315000e+03 15.000000 75% 1.000000 8.874600e+05 0.000000 5.000000 2.000000 14.000000 1.090000 0.570000 4.000000 153.000000 ... 388.000000 138953.000000 0.550000 13.327000 2.747000 10726.000000 1.210000 180.000000 3.141000e+03 20.000000 90% 1.000000 1.059705e+06 1.000000 10.000000 3.000000 29.000000 1.120000 0.980000 27.000000 492.000000 ... 611.000000 311492.000000 0.650000 35.567000 7.547000 30384.900000 1.400000 392.000000 6.634000e+03 22.000000 99% 1.000000 2.226893e+06 1.000000 27.000000 7.000000 82.000000 1.190000 1.000000 48.000000 1752.000000 ... 1464.000000 484734.000000 0.780000 66.007000 14.453000 138722.000000 2.000000 1212.000000 2.625670e+05 23.000000 max 1.000000 2.238426e+06 1.000000 3167.000000 106.000000 711.000000 3.180000 1.000000 49.000000 13475.000000 ... 43700.000000 527026.000000 0.990000 67.140000 14.507000 194386.000000 11.000000 9956.000000 9.651192e+06 23.000000
12 rows × 49 columns
# Remove fully duplicated rows (keeping the first occurrence) in place,
# then show the resulting shape.
df.drop_duplicates(keep='first', inplace=True)
df.shape
(689945, 51)
# Share of missing values per column, sorted ascending.
missing_share = df.isnull().mean()
null = missing_share.reset_index().sort_values(by=0)

# Give the two columns readable names: '特征' (feature), '缺失比' (missing ratio).
null_1 = null.rename(columns={'index': '特征', 0: '缺失比'})
null_1
特征 缺失比 0 label 0.000000 48 sid 0.000000 4 iforderpv_24h 0.000000 50 h 0.000000 2 d 0.000000 1 sampleid 0.000000 3 arrival 0.000000 8 hotelcr 0.001155 21 hoteluv 0.001155 26 lowestprice 0.002919 44 cityuvs 0.011118 17 cancelrate 0.016984 14 novoters 0.024679 28 customereval_pre2 0.041500 10 landhalfhours 0.041500 40 uv_pre2 0.041679 41 lowestprice_pre2 0.042440 39 uv_pre 0.042608 24 cr_pre 0.042608 38 lowestprice_pre 0.043853 34 novoters_pre2 0.046857 33 cancelrate_pre 0.053526 45 cityorders 0.056065 35 novoters_pre 0.059409 32 commentnums_pre2 0.060132 12 commentnums 0.098437 43 businessrate_pre2 0.126075 30 commentnums_pre 0.132731 49 visitnum_oneyear 0.140642 46 lastpvgap 0.140775 37 deltaprice_pre2_t1 0.212720 22 businessrate_pre 0.298646 13 starprefer 0.326190 20 price_sensitive 0.327719 15 consuming_capacity 0.327719 47 cr 0.336330 25 avgprice 0.337250 23 ordernum_oneyear 0.350918 42 lasthtlordergap 0.350918 11 ordercanncelednum 0.350918 9 ordercanceledprecent 0.350918 36 ctrip_profits 0.354750 31 customer_value_profit 0.363539 29 delta_price2 0.365529 19 delta_price1 0.366405 16 historyvisit_avghotelnum 0.437816 7 historyvisit_totalordernum 0.439774 5 decisionhabit_user 0.441332 27 firstorder_bu 0.453590 18 historyvisit_visit_detailpagenum 0.554698 6 historyvisit_7ordernum 0.879824
# Density of the per-column missing ratios.
# NOTE(review): newer seaborn releases deprecate `shade` in favour of `fill`;
# kept as-is here so the call keeps working on the seaborn version in use.
plt.figure(figsize=(8, 6))
sns.kdeplot(null_1['缺失比'], shade=True)

# Bar chart of the same ratios, one bar per column in null_1's row order.
plt.figure(figsize=(8, 6))
bar_positions = range(len(null_1))
plt.bar(bar_positions, null_1['缺失比'], label='lost rate')
plt.legend(loc='best')

# Drop the column with the highest missing ratio (about 88% missing).
df = df.drop(columns=['historyvisit_7ordernum'])
df
label sampleid d arrival iforderpv_24h decisionhabit_user historyvisit_totalordernum hotelcr ordercanceledprecent landhalfhours ... lowestprice_pre2 lasthtlordergap businessrate_pre2 cityuvs cityorders lastpvgap cr sid visitnum_oneyear h 0 0 24636 2016-05-18 2016-05-18 0 NaN NaN 1.04 NaN 22.0 ... 615.0 NaN 0.29 12.880 3.147 NaN NaN 7 NaN 12 1 1 24637 2016-05-18 2016-05-18 0 NaN NaN 1.06 NaN 0.0 ... 513.0 NaN 0.53 17.933 4.913 NaN NaN 33 NaN 14 2 0 24641 2016-05-18 2016-05-19 0 NaN NaN 1.05 NaN 3.0 ... 382.0 NaN 0.60 3.993 0.760 NaN NaN 10 NaN 19 3 0 24642 2016-05-18 2016-05-18 0 NaN NaN 1.01 NaN 2.0 ... 203.0 NaN 0.18 3.220 0.660 NaN NaN 8 NaN 16 4 1 24644 2016-05-18 2016-05-19 0 NaN NaN 1.00 NaN 0.0 ... 84.0 NaN NaN 0.013 NaN NaN NaN 1 NaN 21 ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... 689940 1 2238419 2016-05-15 2016-05-17 1 19.0 NaN 1.06 NaN 1.0 ... 406.0 NaN 0.48 13.573 1.660 1034.0 1.0 5 119.0 18 689941 1 2238421 2016-05-15 2016-05-15 1 10.0 3.0 1.06 0.33 49.0 ... 199.0 713.0 0.51 2.880 0.513 179.0 2.0 15 1472.0 12 689942 0 2238422 2016-05-15 2016-05-17 0 NaN NaN 1.07 NaN 0.0 ... 544.0 NaN 0.45 15.293 2.067 0.0 NaN 8 107.0 0 689943 0 2238425 2016-05-15 2016-05-17 0 NaN NaN 1.04 NaN 0.0 ... 156.0 NaN 0.29 2.467 0.333 NaN NaN 4 NaN 0 689944 0 2238426 2016-05-15 2016-05-15 0 NaN NaN 1.02 NaN 0.0 ... 275.0 NaN NaN 12.600 2.653 NaN NaN 2 NaN 11
689945 rows × 50 columns
df. describe( [ 0.01 , 0.25 , 0.5 , 0.75 , 0.99 ] ) . T
count mean std min 1% 25% 50% 75% 99% max label 689945.0 0.274452 0.446238 0.000 0.00000 0.000 0.000 1.000 1.000000e+00 1.000 sampleid 689945.0 628540.209625 414681.498697 24636.000 36205.88000 312320.000 599637.000 887460.000 2.226893e+06 2238426.000 iforderpv_24h 689945.0 0.193737 0.395226 0.000 0.00000 0.000 0.000 0.000 1.000000e+00 1.000 decisionhabit_user 385450.0 5.317048 38.524483 0.000 1.00000 2.000 3.000 5.000 2.700000e+01 3167.000 historyvisit_totalordernum 386525.0 11.710487 17.251429 1.000 1.00000 2.000 6.000 14.000 8.200000e+01 711.000 hotelcr 689148.0 1.060996 0.045264 1.000 1.00000 1.030 1.050 1.090 1.190000e+00 3.180 ordercanceledprecent 447831.0 0.342119 0.354210 0.000 0.00000 0.000 0.250 0.570 1.000000e+00 1.000 landhalfhours 661312.0 6.086366 12.413225 0.000 0.00000 0.000 0.000 4.000 4.800000e+01 49.000 ordercanncelednum 447831.0 154.179369 398.456986 0.000 0.00000 0.000 2.000 153.000 1.752000e+03 13475.000 commentnums 622029.0 1272.090888 2101.871601 0.000 1.00000 115.000 514.000 1670.000 8.796000e+03 34189.000 starprefer 464892.0 67.532304 19.175094 0.000 20.00000 53.300 69.400 80.300 1.000000e+02 100.000 novoters 672918.0 1706.247901 2811.690007 1.000 1.00000 157.000 692.000 2196.000 1.157600e+04 45455.000 consuming_capacity 463837.0 39.154140 23.240147 0.000 8.00000 22.000 33.000 51.000 1.000000e+02 100.000 historyvisit_avghotelnum 387876.0 6.510179 41.045261 0.000 1.00000 2.000 4.000 7.000 2.900000e+01 3167.000 cancelrate 678227.0 1051.604143 1509.066134 1.000 2.00000 137.000 503.000 1373.000 6.399000e+03 18930.000 historyvisit_visit_detailpagenum 307234.0 37.153603 73.402891 1.000 1.00000 6.000 18.000 44.000 2.620000e+02 6199.000 delta_price1 437146.0 79.067012 512.942824 -99879.000 -1227.55000 -31.000 81.000 226.000 1.081000e+03 5398.000 price_sensitive 463837.0 24.645863 26.685606 0.000 0.00000 5.000 16.000 33.000 1.000000e+02 100.000 hoteluv 689148.0 95.092708 169.981527 0.007 0.16700 10.427 36.180 107.747 9.641130e+02 1722.613 
businessrate_pre 483896.0 0.372717 0.232791 0.000 0.01000 0.150 0.390 0.570 8.000000e-01 0.990 ordernum_oneyear 447831.0 11.642061 17.137209 1.000 1.00000 2.000 6.000 14.000 8.100000e+01 711.000 cr_pre 660548.0 1.062906 0.044588 1.000 1.00000 1.030 1.060 1.090 1.190000e+00 2.950 avgprice 457261.0 422.458701 290.853332 1.000 91.00000 232.000 350.000 524.000 1.491000e+03 6383.000 lowestprice 687931.0 318.806242 575.782415 -3.000 37.00000 116.000 200.000 380.000 1.823000e+03 100000.000 firstorder_bu 376993.0 11.697795 2.746821 1.000 3.00000 12.000 13.000 13.000 1.700000e+01 21.000 customereval_pre2 661312.0 3.048519 1.226635 0.000 0.00000 2.000 3.000 4.000 5.500000e+00 6.000 delta_price2 437750.0 77.277208 391.413839 -43344.000 -949.00000 -29.000 69.000 198.000 1.018000e+03 5114.000 commentnums_pre 598368.0 1415.159561 2329.418922 0.000 1.00000 137.000 592.000 1862.000 9.732000e+03 34189.000 customer_value_profit 439123.0 3.038409 6.625281 -24.075 -0.29678 0.269 0.991 3.138 2.845100e+01 598.064 commentnums_pre2 648457.0 1313.388737 1719.513354 0.000 3.00000 270.000 768.000 1780.000 7.457000e+03 34189.000 cancelrate_pre 653015.0 0.344422 0.179147 0.000 0.05000 0.230 0.320 0.420 1.000000e+00 1.000 novoters_pre2 657616.0 1787.197614 2316.712985 1.000 5.00000 391.000 1054.000 2413.000 1.001800e+04 45436.000 novoters_pre 648956.0 1890.698450 3116.120062 1.000 2.00000 187.000 783.000 2453.000 1.383900e+04 45436.000 ctrip_profits 445187.0 4.208495 9.314438 -44.313 -0.39300 0.340 1.347 4.320 4.075580e+01 600.820 deltaprice_pre2_t1 543180.0 3.283740 48.805880 -2296.000 -103.00000 -3.000 2.000 10.000 1.110000e+02 3324.000 lowestprice_pre 659689.0 315.954583 463.723643 1.000 38.00000 118.000 208.000 385.000 1.750000e+03 100000.000 uv_pre 660548.0 107.846076 186.731907 0.007 0.24000 12.533 42.500 124.707 1.047787e+03 1722.613 uv_pre2 661189.0 103.352990 157.117863 0.007 0.50000 17.563 51.287 126.200 8.567254e+02 1722.613 lowestprice_pre2 660664.0 318.541812 351.913035 1.000 
52.00000 145.000 233.000 388.000 1.464000e+03 43700.000 lasthtlordergap 447831.0 101830.919400 122784.313864 0.000 244.00000 14999.000 46890.000 138953.000 4.847340e+05 527026.000 businessrate_pre2 602960.0 0.368237 0.219945 0.000 0.01000 0.170 0.400 0.550 7.800000e-01 0.990 cityuvs 682274.0 10.648278 15.696682 0.007 0.01300 0.827 3.527 13.327 6.600700e+01 67.140 cityorders 651263.0 2.253250 3.538453 0.007 0.00700 0.127 0.627 2.747 1.445300e+01 14.507 lastpvgap 592818.0 12049.409382 25601.374138 0.000 0.00000 551.000 2848.000 10726.000 1.387220e+05 194386.000 cr 457896.0 1.137476 0.204789 1.000 1.00000 1.000 1.050 1.210 2.000000e+00 11.000 sid 689945.0 153.702414 277.807697 0.000 1.00000 17.000 62.000 180.000 1.212000e+03 9956.000 visitnum_oneyear 592910.0 18551.846682 228860.311117 1.000 21.00000 471.000 1315.000 3141.000 2.625670e+05 9651192.000 h 689945.0 14.462315 6.301575 0.000 0.00000 11.000 15.000 20.000 2.300000e+01 23.000
df[ [ 'lowestprice_pre' , 'lowestprice' ] ] . describe( [ 0.01 , 0.25 , 0.5 , 0.75 , 0.99 ] ) . T
count mean std min 1% 25% 50% 75% 99% max lowestprice_pre 659689.0 315.954583 463.723643 1.0 38.0 118.0 208.0 385.0 1750.0 100000.0 lowestprice 687931.0 318.806242 575.782415 -3.0 37.0 116.0 200.0 380.0 1823.0 100000.0
# Columns whose extreme values get capped below.
col_block = ['lowestprice_pre', 'lowestprice']


def block_upper(x, q=0.99):
    """Cap the values of ``x`` that lie above its ``q``-quantile.

    Parameters
    ----------
    x : pandas.Series
        Numeric values to cap. The input is not modified.
    q : float, default 0.99
        Quantile used as the upper cap.

    Returns
    -------
    pandas.Series
        Copy of ``x`` in which every value greater than the ``q``-quantile is
        replaced by that quantile. Missing values stay missing because the
        ``>`` comparison is False for NaN.
    """
    upper = x.quantile(q)
    return x.mask(x > upper, upper)
def block_lower(x, q=0.01):
    """Raise the values of ``x`` that lie below its ``q``-quantile.

    Parameters
    ----------
    x : pandas.Series
        Numeric values to floor. The input is not modified.
    q : float, default 0.01
        Quantile used as the lower floor.

    Returns
    -------
    pandas.Series
        Copy of ``x`` in which every value smaller than the ``q``-quantile is
        replaced by that quantile. Missing values stay missing because the
        ``<`` comparison is False for NaN.
    """
    lower = x.quantile(q)
    return x.mask(x < lower, lower)
# Cap the price columns from above first, then from below; the lower
# quantile is therefore computed on the already upper-capped values.
upper_capped = df[col_block].apply(block_upper)
df[col_block] = upper_capped
df[col_block] = df[col_block].apply(block_lower)

# Re-inspect the distribution of the two capped columns.
price_summary = df[['lowestprice_pre', 'lowestprice']].describe([0.01, 0.25, 0.5, 0.75, 0.99])
price_summary.T
count mean std min 1% 25% 50% 75% 99% max lowestprice_pre 659689.0 304.439507 287.192512 38.0 38.0 118.0 208.0 385.0 1750.0 1750.0 lowestprice 687931.0 305.025771 297.382838 37.0 37.0 116.0 200.0 380.0 1823.0 1823.0
# Work on an independent copy so the cleaned frame stays untouched.
df_copy = df.copy(deep=True)

# Positional split: column 0 is the target, column 1 is skipped,
# everything from column 2 onwards is used as features.
feature_names = df_copy.columns[2:]
target_name = df_copy.columns[0]
X = df_copy[feature_names]
y = df_copy[target_name]
X.head(10)
d arrival iforderpv_24h decisionhabit_user historyvisit_totalordernum hotelcr ordercanceledprecent landhalfhours ordercanncelednum commentnums ... lowestprice_pre2 lasthtlordergap businessrate_pre2 cityuvs cityorders lastpvgap cr sid visitnum_oneyear h 0 2016-05-18 2016-05-18 0 NaN NaN 1.04 NaN 22.0 NaN 1089.0 ... 615.0 NaN 0.29 12.880 3.147 NaN NaN 7 NaN 12 1 2016-05-18 2016-05-18 0 NaN NaN 1.06 NaN 0.0 NaN 5612.0 ... 513.0 NaN 0.53 17.933 4.913 NaN NaN 33 NaN 14 2 2016-05-18 2016-05-19 0 NaN NaN 1.05 NaN 3.0 NaN 256.0 ... 382.0 NaN 0.60 3.993 0.760 NaN NaN 10 NaN 19 3 2016-05-18 2016-05-18 0 NaN NaN 1.01 NaN 2.0 NaN NaN ... 203.0 NaN 0.18 3.220 0.660 NaN NaN 8 NaN 16 4 2016-05-18 2016-05-19 0 NaN NaN 1.00 NaN 0.0 NaN NaN ... 84.0 NaN NaN 0.013 NaN NaN NaN 1 NaN 21 5 2016-05-18 2016-05-20 0 NaN NaN 1.02 NaN 0.0 NaN 15.0 ... 408.0 NaN NaN 2.880 0.427 NaN NaN 1 NaN 21 6 2016-05-18 2016-05-25 0 NaN NaN 1.12 NaN 0.0 NaN 2578.0 ... 145.0 NaN NaN 4.427 0.493 NaN NaN 1 NaN 22 7 2016-05-18 2016-05-20 0 3.0 21.0 1.11 0.79 0.0 395.0 NaN ... 204.0 10475.0 0.53 12.713 1.987 7566.0 1.5 23 1265.0 17 8 2016-05-18 2016-05-19 0 13.0 NaN 1.08 NaN 0.0 NaN 2572.0 ... 99.0 NaN 0.41 5.393 0.860 15.0 1.0 20 596.0 20 9 2016-05-18 2016-06-08 1 2.0 7.0 1.07 0.86 47.0 6.0 NaN ... 191.0 18873.0 0.52 3.093 0.287 288.0 1.0 31 21926.0 7
10 rows × 48 columns
# Hold out a test set (scikit-learn's default split size) with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# The two date columns are not used as features. Drop them from BOTH splits
# so train and test keep the same columns, and reassign instead of dropping
# in place so the split results are not mutated as potential views.
col_date = ['d', 'arrival']
X_train = X_train.drop(columns=col_date)
X_test = X_test.drop(columns=col_date)
X_train.shape
(517458, 46)
# ---- Column groups that drive the imputation strategy -------------------
col = X_train.columns.tolist()
# Columns excluded from the mean-imputed group.
col_no = ['sid', 'iforderpv_24h', 'h']
# Imputed with the most frequent value.
col_clf = ['decisionhabit_user']
# Imputed with the median.
col_neg = ['delta_price1', 'delta_price2', 'customer_value_profit',
           'ctrip_profits', 'deltaprice_pre2_t1']
# Imputed with the sentinel -1.
col_35 = ['ordernum_oneyear', 'lasthtlordergap', 'ordercanncelednum',
          'ordercanceledprecent', 'ctrip_profits', 'historyvisit_avghotelnum',
          'historyvisit_totalordernum', 'decisionhabit_user', 'firstorder_bu',
          'historyvisit_visit_detailpagenum']
# High-spread columns (std > 100), minus four handled elsewhere: median.
col_std = X_train.columns[X_train.describe(include='all').T['std'] > 100].to_list()
for excluded in ('sid', 'delta_price2', 'delta_price1', 'lasthtlordergap'):
    col_std.remove(excluded)
# Everything not covered by the named groups above: mean.
col_norm = list(set(col) - set(col_no + col_clf + col_neg + col_35))

# ---- Imputation ---------------------------------------------------------
# Steps run in this order, so when a column appears in two groups the earlier
# step fills it and the later one finds nothing left to fill.
# Every fill value is computed from X_train only and then applied to both
# splits, so the test set is never imputed with its own statistics.
# NOTE(review): `mode()` returns a DataFrame; passing that to `fillna` aligns
# on the row index and fills almost nothing, so the first row of the mode is
# taken explicitly. As a result 'decisionhabit_user' (also listed in col_35)
# is now filled with its mode instead of falling through to -1 - confirm this
# is the intended strategy.
imputation_steps = [
    (col_clf, lambda frame: frame.mode().iloc[0]),
    (col_neg, lambda frame: frame.median()),
    (col_35, lambda frame: -1),
    (col_std, lambda frame: frame.median()),
    (col_norm, lambda frame: frame.mean()),
]
for group, fill_value_of in imputation_steps:
    fill_value = fill_value_of(X_train[group])
    X_train[group] = X_train[group].fillna(fill_value)
    X_test[group] = X_test[group].fillna(fill_value)

# Number of training columns that still contain missing values.
X_train.isnull().any().sum()
0
X_test. isnull( ) . any ( ) . sum ( )
0
X_train. shape
(517458, 46)
X_test. shape
(172487, 48)
# Filter out zero-variance features (threshold 0.0 is the default) and
# check how many columns survive.
selector = VarianceThreshold(threshold=0.0)
selector.fit(X_train)
X_train_var = selector.transform(X_train)
X_train_var.shape
(517458, 46)
# ANOVA F-test of every feature against the label; count the features whose
# p-value is above 0.01.
f, p_values = f_classif(X_train, y_train)
np.count_nonzero(p_values > 0.01)
6
# Keep only the features whose F-test p-value is at most 0.01.
is_significant = p_values <= 0.01
col_f = X_train.columns[is_significant]
X_train = X_train.loc[:, col_f]
X_train.shape
(517458, 40)
# Apply the same column selection to the test split.
X_test = X_test.loc[:, col_f]
X_test.shape
(172487, 40)
# Renumber all four splits from 0 so their indexes no longer carry the
# shuffled row labels from the split.
X_train, X_test, y_train, y_test = [
    part.reset_index(drop=True)
    for part in (X_train, X_test, y_train, y_test)
]

# Fit a small random forest and read its feature importances.
rfc = RFC(n_estimators=10, random_state=42)
rfc.fit(X_train, y_train)
importances = rfc.feature_importances_
importances
array([0.0116028 , 0.01896167, 0.01802851, 0.0150776 , 0.01591757,
0.02015549, 0.02034934, 0.0203898 , 0.01943307, 0.0211841 ,
0.01791408, 0.02056686, 0.02291317, 0.02291462, 0.02051174,
0.02323111, 0.02106971, 0.01054197, 0.02306534, 0.01938803,
0.0258305 , 0.02571515, 0.02508617, 0.02485002, 0.02534584,
0.02850795, 0.02408976, 0.02625239, 0.0279376 , 0.02743348,
0.02772216, 0.03058076, 0.02754623, 0.04068007, 0.03904556,
0.03655198, 0.03553142, 0.03861745, 0.04116244, 0.03829647])
# Scan 20 importance thresholds between 0 and the largest importance. For
# each one, keep the features SelectFromModel selects and record the mean
# 5-fold cross-validation score of the forest on that subset.
scores = []
thresholds = np.linspace(0, importances.max(), 20)
for threshold in thresholds:
    time0 = time()
    X_embedded = SelectFromModel(rfc, threshold=threshold).fit_transform(X_train, y_train)
    score = cross_val_score(rfc, X_embedded, y_train, cv=5, n_jobs=-1).mean()
    scores.append(score)
    # Print the elapsed time as MM:SS:microseconds. The duration is split
    # arithmetically rather than fed to datetime.fromtimestamp(), which would
    # interpret it as a local-time epoch (wrong minutes in time zones with a
    # non-whole-hour offset, and wrapping after 60 minutes).
    elapsed = time() - time0
    minutes, seconds = divmod(elapsed, 60)
    print('{:02d}:{:02d}:{:06d}'.format(
        int(minutes), int(seconds), int((seconds % 1) * 1e6)))
plt.plot(thresholds, scores)
plt.show()
01:12:090613
01:13:526636
01:09:249811
01:08:676264
01:07:224143
01:06:510684
01:14:200829
01:13:976302
01:13:173232
01:07:879761
01:05:081422
01:03:703478
00:57:668075
00:55:913328
00:49:275084
00:47:250707
00:48:596243
00:53:333874
00:45:020932
00:53:343730
max ( scores)
0.9507844100831351
thresholds[ scores. index( max ( scores) ) ]
0.028163774952387383
# Derive the best threshold from the scan results instead of pasting the
# printed float, so the selection stays correct if the scan is re-run on
# different data.
best_threshold = thresholds[scores.index(max(scores))]

# SelectFromModel keeps features whose importance is greater than OR EQUAL to
# the threshold, so `>=` reproduces the subset that was cross-validated.
col_k = X_train.columns[importances >= best_threshold].to_list()

# Independent copy, so later in-place edits do not act on a slice of X_train.
X_train_embedded = X_train[col_k].copy()
X_train_embedded.head()
ctrip_profits lasthtlordergap cityuvs cityorders lastpvgap cr sid visitnum_oneyear h 0 1.347 -1.0 3.787 0.387 2850.0 1.137405 3 1314.0 6 1 1.347 -1.0 0.127 0.007 2850.0 1.137405 7 1314.0 13 2 0.767 -1.0 18.973 3.600 7272.0 1.137405 457 348.0 12 3 15.433 1986.0 1.507 0.287 47.0 1.160000 430 20273.0 8 4 1.347 -1.0 1.433 0.167 20.0 1.000000 85 83.0 13
# Same feature subset for the test split.
X_test_embedded = X_test.loc[:, col_k]
X_test_embedded.head()
ctrip_profits lasthtlordergap cityuvs cityorders lastpvgap cr sid visitnum_oneyear h 0 3.940 7224.0 7.147 0.580000 539.0 1.150000 220 4542.0 4 1 1.347 40911.0 0.447 0.060000 3.0 1.137687 81 3156.0 3 2 0.887 -1.0 4.313 0.460000 6532.0 1.000000 81 1026.0 17 3 1.347 -1.0 0.460 0.053000 363.0 1.000000 27 349.0 22 4 1.540 82256.0 0.060 2.246314 41.0 1.170000 63 811.0 10
# Pairwise correlations of the selected features.
plt.figure(figsize=(10, 8))
feature_corr = X_train_embedded.corr()
sns.heatmap(feature_corr, annot=True, linewidths=1)

# Remove 'cityuvs' from both splits (presumably because the heatmap shows it
# strongly correlated with another feature - confirm against the plot).
X_train_embedded = X_train_embedded.drop(columns='cityuvs')
X_test_embedded = X_test_embedded.drop(columns='cityuvs')

# Persist the prepared splits ...
X_train_embedded.to_csv('X_train_embedded.csv')
X_test_embedded.to_csv('X_test_embedded.csv')
y_train.to_csv('y_train.csv')
y_test.to_csv('y_test.csv')

# ... and load them back, using the first CSV column as the index.
X_train_embedded = pd.read_csv('X_train_embedded.csv', index_col=0)
X_test_embedded = pd.read_csv('X_test_embedded.csv', index_col=0)

# The labels come back as one-column frames; flatten them to 1-D arrays.
y_train = np.ravel(pd.read_csv('y_train.csv', index_col=0))
y_train.shape
y_test = np.ravel(pd.read_csv('y_test.csv', index_col=0))
y_test.shape
(172487,)
# Import the submodule explicitly: a bare `import scipy` is not guaranteed to
# make `scipy.stats` available, and code below calls
# scipy.stats.chi2_contingency.
import scipy.stats

# Training features plus the label side by side; this frame is what the
# WOE / IV helpers below read.
woe_data = pd.concat([X_train_embedded, pd.Series(y_train, name='label')], axis=1)
woe_data
ctrip_profits lasthtlordergap cityorders lastpvgap cr sid visitnum_oneyear h label 0 1.347 -1.0 0.387 2850.0 1.137405 3 1314.0 6 0 1 1.347 -1.0 0.007 2850.0 1.137405 7 1314.0 13 0 2 0.767 -1.0 3.600 7272.0 1.137405 457 348.0 12 1 3 15.433 1986.0 0.287 47.0 1.160000 430 20273.0 8 1 4 1.347 -1.0 0.167 20.0 1.000000 85 83.0 13 0 ... ... ... ... ... ... ... ... ... ... 517453 1.347 -1.0 0.113 4347.0 1.137405 8 278.0 21 0 517454 1.347 41045.0 0.520 2972.0 1.330000 25 1095.0 0 0 517455 1.347 113046.0 0.093 522.0 1.137405 120 6309.0 16 0 517456 -0.067 266544.0 0.600 28378.0 1.000000 22 100.0 9 0 517457 1.347 -1.0 0.420 2850.0 1.137405 5 1314.0 17 0
517458 rows × 9 columns
def get_woe(num_bins):
    """Build the per-bin WOE table.

    Parameters
    ----------
    num_bins : sequence of (min, max, count_0, count_1) tuples
        One entry per bin: lower edge, upper edge, number of label-0 rows and
        number of label-1 rows.

    Returns
    -------
    pandas.DataFrame
        The four input columns followed by 'total', 'percentage', 'bad_rate',
        'good%', 'bad%', 'good-bad' and 'woe', where
        woe = log(good% / bad%).
    """
    table = pd.DataFrame(num_bins, columns=['min', 'max', 'count_0', 'count_1'])
    goods = table['count_0']
    bads = table['count_1']

    table['total'] = goods + bads
    table['percentage'] = table['total'] / table['total'].sum()
    table['bad_rate'] = bads / table['total']
    # Share of all label-0 / label-1 rows that falls into each bin.
    table['good%'] = goods / goods.sum()
    table['bad%'] = bads / bads.sum()
    table['good-bad'] = table['good%'] - table['bad%']
    table['woe'] = np.log(table['good%'] / table['bad%'])
    return table
def get_iv(df):
    """Return the information value of a WOE table.

    ``df`` must provide the 'good-bad' and 'woe' columns; the result is the
    sum over all bins of their product.
    """
    per_bin = df['good-bad'] * df['woe']
    return per_bin.sum()
def get_bin(X, q):
    """Quantile-bin one feature of ``woe_data`` and return its WOE table.

    Parameters
    ----------
    X : str
        Name of the column of the module-level ``woe_data`` frame to bin.
    q : int
        Number of quantile bins requested; duplicate bin edges are dropped,
        so fewer bins may be produced.

    Returns
    -------
    pandas.DataFrame
        The table built by ``get_woe`` with one row per bin.
    """
    # Bin the feature without copying or mutating the shared frame.
    binned, updown = pd.qcut(woe_data[X], retbins=True, q=q, duplicates='drop')
    labels = woe_data['label']

    # Count each class per bin. observed=False keeps bins in which a class
    # does not occur (count 0), so both series have one entry per bin in bin
    # order and can safely be zipped with the bin edges by position.
    count_0 = (labels == 0).groupby(binned, observed=False).sum()
    count_1 = (labels == 1).groupby(binned, observed=False).sum()

    num_bins = [*zip(updown, updown[1:], count_0, count_1)]
    return get_woe(num_bins)
def get_graph(X, n=2, q=20):
    """Plot information value against bin count while merging bins.

    The feature is first cut into up to ``q`` quantile bins. Then, until only
    ``n`` bins remain, the adjacent pair whose class counts give the largest
    chi-square p-value is merged and the IV of the new binning is recorded.
    Finally IV is plotted against the number of bins.

    Args:
        X: name of the column in the module-level ``woe_data`` frame to bin.
        n: number of bins at which merging stops (values below 1 behave as 1).
        q: number of quantiles for the initial ``pd.qcut``.
    """
    # Import the submodule explicitly: a bare `import scipy` is not guaranteed
    # to make `scipy.stats` available on every SciPy release.
    from scipy import stats

    # Only the feature and the label are needed; avoid copying every column.
    frame = woe_data[[X, 'label']].copy()
    frame['qcut'], edges = pd.qcut(frame[X], q=q, retbins=True, duplicates='drop')
    # Counts are zipped with the edges by position, so keep empty bins in both
    # series (observed=False) to stop the two classes drifting out of step.
    count_0 = frame[frame['label'] == 0].groupby('qcut', observed=False)['label'].count()
    count_1 = frame[frame['label'] == 1].groupby('qcut', observed=False)['label'].count()
    num_bins = list(zip(edges, edges[1:], count_0, count_1))

    IV = []
    axisx = []
    # At least one bin must remain, otherwise there is no pair left to compare.
    target = max(n, 1)
    while len(num_bins) > target:
        # p-value of the chi-square test on the (count_0, count_1) rows of each
        # adjacent pair of bins.
        pvs = [
            stats.chi2_contingency([left[2:], right[2:]])[1]
            for left, right in zip(num_bins, num_bins[1:])
        ]
        # Merge the pair with the largest p-value (first one on ties).
        i = pvs.index(max(pvs))
        left, right = num_bins[i], num_bins[i + 1]
        num_bins[i:i + 2] = [(left[0], right[1], left[2] + right[2], left[3] + right[3])]

        axisx.append(len(num_bins))
        IV.append(get_iv(get_woe(num_bins)))

    plt.figure()
    plt.plot(axisx, IV)
    plt.xticks(axisx)
    plt.xlabel("number of box")
    plt.ylabel("IV")
    plt.show()
col_woe = [
    'ctrip_profits',
    'lasthtlordergap',
    'cityorders',
    'lastpvgap',
    'cr',
    'sid',
    'visitnum_oneyear',
    'h',
]
# Print each feature name, then draw its IV-versus-bin-count curve.
for feature in col_woe:
    print(feature)
    get_graph(feature)
ctrip_profits
lasthtlordergap
cityorders
lastpvgap
cr
sid
visitnum_oneyear
h
# Show the WOE table for 'lasthtlordergap' with up to 10 quantile bins
# (get_bin drops duplicate quantile edges, so fewer rows may come back).
get_bin( 'lasthtlordergap' , 10 )
min max count_0 count_1 total percentage bad_rate good% bad% good-bad woe 0 -1.0 2356.0 158912 48086 206998 0.400029 0.232302 0.423198 0.338741 0.084457 0.222603 1 2356.0 13291.0 30983 20754 51737 0.099983 0.401144 0.082511 0.146201 -0.063691 -0.572057 2 13291.0 29219.0 34175 17574 51749 0.100006 0.339601 0.091011 0.123800 -0.032789 -0.307683 3 29219.0 56455.0 35511 16229 51740 0.099989 0.313664 0.094569 0.114325 -0.019756 -0.189714 4 56455.0 110984.0 37245 14499 51744 0.099997 0.280206 0.099187 0.102138 -0.002951 -0.029318 5 110984.0 232020.0 39177 12572 51749 0.100006 0.242942 0.104332 0.088563 0.015769 0.163861 6 232020.0 527026.0 39500 12241 51741 0.099991 0.236582 0.105192 0.086232 0.018961 0.198753
# Show the WOE table for 'cr' with up to 10 quantile bins
# (get_bin drops duplicate quantile edges, so fewer rows may come back).
get_bin( 'cr' , 10 )
min max count_0 count_1 total percentage bad_rate good% bad% good-bad woe 0 1.000000 1.120000 166695 42471 209166 0.404218 0.203049 0.443925 0.299186 0.144738 0.394588 1 1.120000 1.137405 135099 47248 182347 0.352390 0.259110 0.359781 0.332838 0.026944 0.077841 2 1.137405 1.170000 15752 7000 22752 0.043969 0.307665 0.041949 0.049311 -0.007362 -0.161699 3 1.170000 1.330000 37674 24461 62135 0.120077 0.393675 0.100329 0.172315 -0.071986 -0.540866 4 1.330000 11.000000 20283 20775 41058 0.079346 0.505992 0.054016 0.146349 -0.092334 -0.996724
# Show the WOE table for 'cityorders' with up to 10 quantile bins.
get_bin( 'cityorders' , 10 )
min max count_0 count_1 total percentage bad_rate good% bad% good-bad woe 0 0.007000 0.033000 44555 11516 56071 0.108359 0.205382 0.118654 0.081124 0.037530 0.380231 1 0.033000 0.093000 39744 10584 50328 0.097260 0.210300 0.105842 0.074559 0.031283 0.350359 2 0.093000 0.200000 40294 11095 51389 0.099310 0.215902 0.107307 0.078159 0.029148 0.316952 3 0.200000 0.380000 37656 11761 49417 0.095500 0.237995 0.100281 0.082850 0.017431 0.190947 4 0.380000 0.753000 37170 14675 51845 0.100192 0.283055 0.098987 0.103378 -0.004391 -0.043400 5 0.753000 1.400000 35492 16268 51760 0.100027 0.314297 0.094519 0.114600 -0.020081 -0.192649 6 1.400000 2.255565 49463 16530 65993 0.127533 0.250481 0.131725 0.116445 0.015279 0.123292 7 2.255565 3.260000 25531 11738 37269 0.072023 0.314953 0.067991 0.082688 -0.014697 -0.195694 8 3.260000 6.633000 32872 18901 51773 0.100053 0.365074 0.087541 0.133148 -0.045607 -0.419350 9 6.633000 14.507000 32726 18887 51613 0.099743 0.365935 0.087152 0.133049 -0.045897 -0.423060
# Show the WOE table for 'h' with up to 10 quantile bins.
get_bin( 'h' , 10 )
min max count_0 count_1 total percentage bad_rate good% bad% good-bad woe 0 0.0 6.0 42287 15678 57965 0.112019 0.270474 0.112614 0.110443 0.002171 0.019465 1 6.0 10.0 46850 22957 69807 0.134904 0.328864 0.124766 0.161720 -0.036954 -0.259428 2 10.0 12.0 34400 16455 50855 0.098279 0.323567 0.091610 0.115917 -0.024307 -0.235329 3 12.0 13.0 19815 8752 28567 0.055206 0.306367 0.052769 0.061653 -0.008884 -0.155599 4 13.0 15.0 38660 18137 56797 0.109762 0.319330 0.102955 0.127766 -0.024811 -0.215905 5 15.0 17.0 42537 18293 60830 0.117555 0.300723 0.113280 0.128865 -0.015585 -0.128901 6 17.0 19.0 40305 16122 56427 0.109047 0.285714 0.107336 0.113571 -0.006235 -0.056466 7 19.0 21.0 49680 15527 65207 0.126014 0.238119 0.132303 0.109380 0.022923 0.190266 8 21.0 22.0 31405 6631 38036 0.073505 0.174335 0.083634 0.046712 0.036922 0.582455 9 22.0 23.0 29564 3403 32967 0.063710 0.103224 0.078732 0.023972 0.054759 1.189144
# Show the WOE table for 'ctrip_profits' with up to 10 quantile bins
# (get_bin drops duplicate quantile edges, so fewer rows may come back).
get_bin( 'ctrip_profits' , 10 )
min max count_0 count_1 total percentage bad_rate good% bad% good-bad woe 0 -44.313 0.147 37921 13979 51900 0.100298 0.269345 0.100987 0.098475 0.002512 0.025192 1 0.147 0.500 37673 13923 51596 0.099711 0.269846 0.100327 0.098080 0.002246 0.022645 2 0.500 1.147 37713 14400 52113 0.100710 0.276323 0.100433 0.101441 -0.001007 -0.009980 3 1.147 1.347 150615 44404 195019 0.376879 0.227691 0.401102 0.312803 0.088299 0.248641 4 1.347 1.587 8296 3333 11629 0.022473 0.286611 0.022093 0.023479 -0.001386 -0.060856 5 1.587 3.220 36089 15701 51790 0.100085 0.303167 0.096108 0.110605 -0.014497 -0.140493 6 3.220 7.327 35310 16403 51713 0.099937 0.317193 0.094034 0.115551 -0.021517 -0.206054 7 7.327 600.820 31886 19812 51698 0.099908 0.383226 0.084915 0.139565 -0.054650 -0.496877
# The same five feature columns are selected from the train and test frames.
woe_features = ['ctrip_profits', 'lasthtlordergap', 'cityorders', 'cr', 'h']
X_train_woe = X_train_embedded[woe_features]
X_test_woe = X_test_embedded[woe_features]
# Sweep max_depth from 5 to 20 with a 10-tree forest and 5-fold CV, then
# report the best mean score and plot score against depth.
scores = []
time0 = time()
depth_grid = np.arange(5, 21, 1)
for i in depth_grid:
    rfc = RFC(n_estimators=10, max_depth=i, random_state=42)
    score = cross_val_score(rfc, X_train_embedded, y_train, cv=5, n_jobs=-1).mean()
    scores.append(score)
# Format the elapsed seconds as a timedelta: pushing a duration through
# datetime.fromtimestamp(...).strftime('%M:%S:%f') drops whole hours and
# depends on the local timezone offset.
print('花费时间:{}'.format(datetime.timedelta(seconds=time() - time0)))
print('最大分数为{},最大深度为{}'.format(max(scores), depth_grid[np.argmax(scores)]))
plt.figure(figsize=(10, 8))
plt.plot(depth_grid, scores)
plt.show()
花费时间:03:27:049065
最大分数为0.8891446300224614,最大深度为20
# Sweep min_samples_split from 2 to 9 at max_depth=20 with 5-fold CV, then
# report the best mean score and plot score against the parameter.
scores = []
time0 = time()
split_grid = np.arange(2, 10, 1)
for i in split_grid:
    rfc = RFC(n_estimators=10, max_depth=20, min_samples_split=i, random_state=42)
    score = cross_val_score(rfc, X_train_embedded, y_train, cv=5, n_jobs=-1).mean()
    scores.append(score)
# Format the elapsed seconds as a timedelta: pushing a duration through
# datetime.fromtimestamp(...).strftime('%M:%S:%f') drops whole hours and
# depends on the local timezone offset.
print('花费时间:{}'.format(datetime.timedelta(seconds=time() - time0)))
print('最大分数为{},最小分割数为{}'.format(max(scores), split_grid[np.argmax(scores)]))
plt.figure(figsize=(10, 8))
plt.plot(split_grid, scores)
plt.show()
花费时间:02:16:546873
最大分数为0.8891446300224614,最小分割数为2
# Sweep min_samples_leaf from 1 to 9 at max_depth=20, min_samples_split=2 with
# 5-fold CV, then report the best mean score and plot score against the parameter.
scores = []
time0 = time()
leaf_grid = np.arange(1, 10, 1)
for i in leaf_grid:
    rfc = RFC(n_estimators=10, max_depth=20, min_samples_split=2, min_samples_leaf=i, random_state=42)
    score = cross_val_score(rfc, X_train_embedded, y_train, cv=5, n_jobs=-1).mean()
    scores.append(score)
# Format the elapsed seconds as a timedelta: pushing a duration through
# datetime.fromtimestamp(...).strftime('%M:%S:%f') drops whole hours and
# depends on the local timezone offset.
print('花费时间:{}'.format(datetime.timedelta(seconds=time() - time0)))
print('最大分数为{},小叶子节点样本数{}'.format(max(scores), leaf_grid[np.argmax(scores)]))
plt.figure(figsize=(10, 8))
plt.plot(leaf_grid, scores)
plt.show()
花费时间:02:13:264602
最大分数为0.8891446300224614,小叶子节点样本数1
# Sweep n_estimators from 10 to 200 in steps of 10 at max_depth=20 with 3-fold
# CV, printing the cumulative elapsed time after every setting.
scores = []
time0 = time()
tree_grid = np.arange(10, 201, 10)
for i in tree_grid:
    rfc = RFC(n_estimators=i, max_depth=20, random_state=42)
    score = cross_val_score(rfc, X_train_embedded, y_train, cv=3, n_jobs=-1).mean()
    scores.append(score)
    # This sweep runs for more than an hour, and a '%M:%S:%f' rendering of
    # fromtimestamp() wraps back to 00 minutes; a timedelta keeps the hours
    # and does not depend on the local timezone.
    print('花费时间:{}'.format(datetime.timedelta(seconds=time() - time0)))
print('最大分数为{},树数量{}'.format(max(scores), tree_grid[np.argmax(scores)]))
plt.figure(figsize=(10, 8))
plt.plot(tree_grid, scores)
plt.show()
花费时间:00:33:316156
花费时间:01:39:789986
花费时间:03:21:423289
花费时间:05:40:185622
花费时间:08:30:948403
花费时间:11:56:105026
花费时间:15:51:769696
花费时间:20:36:914026
花费时间:25:38:212320
花费时间:31:10:214501
花费时间:37:27:759795
花费时间:44:10:525391
花费时间:51:28:059915
花费时间:59:16:547367
花费时间:07:41:280818
花费时间:16:38:253700
花费时间:26:07:070954
花费时间:36:06:570602
花费时间:48:40:747174
花费时间:00:14:880475
最大分数为0.8988864801364768,树数量100
# Fit the final forest on the training split and report accuracy on both splits.
rfc = RFC(n_estimators=100, max_depth=20, random_state=42).fit(X_train_embedded, y_train)
print('训练集得分为{}'.format(rfc.score(X_train_embedded, y_train)))
# The test score must pair the held-out features with the held-out labels;
# scoring X_train_embedded against y_test mixes two splits of different length.
print('测试集得分为{}'.format(rfc.score(X_test_embedded, y_test)))
训练集得分为0.9144162424776504
测试集得分为0.8858812548192038
# Display the fitted forest's importance value for each input feature column.
rfc. feature_importances_
array([0.12193391, 0.12869867, 0.14163503, 0.13799971, 0.10331983,
0.12834216, 0.14547079, 0.09259991])
# Class probabilities for the held-out rows. They are compared with y_test
# when the ROC curve is computed, so they must come from the test features,
# not from X_train_embedded.
y_scores = rfc.predict_proba(X_test_embedded)
y_scores
array([[0.42810513, 0.57189487],
[0.90260411, 0.09739589],
[0.87430575, 0.12569425],
...,
[0.3633656 , 0.6366344 ],
[0.79112542, 0.20887458],
[0.14388709, 0.85611291]])
from sklearn. metrics import roc_curve, auc
# Column 1 of predict_proba holds the probability of class 1.
positive_scores = y_scores[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, positive_scores)
# Area under the ROC curve built from those points.
roc_auc = auc(fpr, tpr)
roc_auc
0.9680887508287116
def draw_roc(roc_auc, fpr, tpr):
    """Draw a ROC curve with a dashed diagonal for reference and show it.

    Args:
        roc_auc: area value printed in the legend label.
        fpr: x values of the curve (false positive rates).
        tpr: y values of the curve (true positive rates).
    """
    _, ax = plt.subplots(figsize=(7, 5.5))
    ax.plot(fpr, tpr, color='orange', label=f'roc curve(area={roc_auc})')
    # Diagonal from (0, 0) to (1, 1).
    ax.plot([0, 1], [0, 1], color='blue', linestyle='--')
    ax.set_xlabel('fpr')
    ax.set_ylabel('tpr')
    ax.set_xlim([0, 1])
    ax.set_ylim([0, 1.05])
    ax.set_title('ROC Curve')
    ax.legend(loc=4)
    plt.show()
draw_roc(roc_auc, fpr, tpr)
# Start the RFM analysis from the id column plus the three source columns.
rfm_columns = ['sampleid', 'ordernum_oneyear', 'avgprice', 'lasthtlordergap']
rfm = df[rfm_columns]
rfm.head()
sampleid ordernum_oneyear avgprice lasthtlordergap 0 24636 NaN NaN NaN 1 24637 NaN NaN NaN 2 24641 NaN NaN NaN 3 24642 NaN NaN NaN 4 24644 NaN NaN NaN
# Drop rows with any missing value, renumber the index, and switch to the
# short F / M / R column names.
rfm_names = {'ordernum_oneyear': 'F', 'avgprice': 'M', 'lasthtlordergap': 'R'}
rfm = rfm.dropna()
rfm = rfm.reset_index(drop=True)
rfm = rfm.rename(columns=rfm_names)
rfm.head()
sampleid F M R 0 24650 21.0 363.0 10475.0 1 24653 7.0 307.0 18873.0 2 24655 1.0 343.0 32071.0 3 24658 33.0 1000.0 4616.0 4 24662 4.0 685.0 44830.0
# Rescale R by 1440 and round to a whole number.
# NOTE(review): presumably converts minutes to days (1440 minutes per day) —
# confirm the unit of lasthtlordergap.
rfm['R'] = (rfm['R'] / 1440).round(0)
rfm.head()
sampleid F M R 0 24650 21.0 363.0 7.0 1 24653 7.0 307.0 13.0 2 24655 1.0 343.0 22.0 3 24658 33.0 1000.0 3.0 4 24662 4.0 685.0 31.0
# Summary statistics with one row per column (transposed); the score bin
# edges defined next are hard-coded and have to cover these ranges.
rfm. describe( ) . T
count mean std min 25% 50% 75% max sampleid 426425.0 629380.138599 414760.183032 24650.0 313549.0 600907.0 887813.0 2238403.0 F 426425.0 12.137916 17.405419 1.0 3.0 6.0 14.0 711.0 M 426425.0 421.604962 286.987700 1.0 233.0 351.0 523.0 6383.0 R 426425.0 70.742163 84.844780 0.0 11.0 33.0 97.0 366.0
# Hand-picked bin edges for each dimension.
f_bins = [-1, 3, 5, 7, 10, 720]
m_bins = [-1, 200, 400, 600, 800, 7000]
r_bins = [-1, 3, 7, 30, 180, 370]

# R is scored in reverse (smallest R gets 5); F and M score upwards.
r_grade = pd.cut(rfm['R'], bins=r_bins, labels=[5, 4, 3, 2, 1])
f_grade = pd.cut(rfm['F'], bins=f_bins, labels=[1, 2, 3, 4, 5])
m_grade = pd.cut(rfm['M'], bins=m_bins, labels=[1, 2, 3, 4, 5])

rfm['R_score'] = r_grade.astype('int')
rfm['F_score'] = f_grade.astype('int')
rfm['M_score'] = m_grade.astype('int')
rfm
sampleid F M R R_score F_score M_score 0 24650 21.0 363.0 7.0 4 5 2 1 24653 7.0 307.0 13.0 3 3 2 2 24655 1.0 343.0 22.0 3 1 2 3 24658 33.0 1000.0 3.0 5 5 5 4 24662 4.0 685.0 31.0 2 2 4 ... ... ... ... ... ... ... ... 426420 2238388 2.0 226.0 119.0 2 1 2 426421 2238389 4.0 461.0 0.0 5 2 3 426422 2238396 5.0 193.0 44.0 2 2 1 426423 2238397 1.0 258.0 87.0 2 1 2 426424 2238403 3.0 256.0 52.0 2 1 2
426425 rows × 7 columns
# Flag each score as 1 when it is above that dimension's mean score, else 0.
for dim in ('R', 'F', 'M'):
    dim_score = rfm[dim + '_score']
    rfm[dim + '_level'] = dim_score.gt(dim_score.mean()).mul(1)
rfm
sampleid F M R R_score F_score M_score R_level F_level M_level 0 24650 21.0 363.0 7.0 4 5 2 1 1 0 1 24653 7.0 307.0 13.0 3 3 2 1 1 0 2 24655 1.0 343.0 22.0 3 1 2 1 0 0 3 24658 33.0 1000.0 3.0 5 5 5 1 1 1 4 24662 4.0 685.0 31.0 2 2 4 0 0 1 ... ... ... ... ... ... ... ... ... ... ... 426420 2238388 2.0 226.0 119.0 2 1 2 0 0 0 426421 2238389 4.0 461.0 0.0 5 2 3 1 0 1 426422 2238396 5.0 193.0 44.0 2 2 1 0 0 0 426423 2238397 1.0 258.0 87.0 2 1 2 0 0 0 426424 2238403 3.0 256.0 52.0 2 1 2 0 0 0
426425 rows × 10 columns
# Segment name for each three-digit code, digits in R_level, F_level, M_level order.
rfm_labels = {
    '111': '重要价值用户',
    '101': '重要发展用户',
    '011': '重要保持用户',
    '001': '重要挽留用户',
    '110': '一般价值用户',
    '100': '一般发展用户',
    '010': '一般保持用户',
    '000': '一般挽留用户',
}
# Concatenate the three 0/1 levels into a code such as '101'.
rfm_code = rfm['R_level'].astype('str') + rfm['F_level'].astype('str') + rfm['M_level'].astype('str')
# Assign the replaced Series back to the frame: calling
# rfm['RFM'].replace(..., inplace=True) works on a temporary column selection
# and does not update the frame under pandas Copy-on-Write.
rfm['RFM'] = rfm_code.replace(rfm_labels)
rfm
sampleid F M R R_score F_score M_score R_level F_level M_level RFM 0 24650 21.0 363.0 7.0 4 5 2 1 1 0 一般价值用户 1 24653 7.0 307.0 13.0 3 3 2 1 1 0 一般价值用户 2 24655 1.0 343.0 22.0 3 1 2 1 0 0 一般发展用户 3 24658 33.0 1000.0 3.0 5 5 5 1 1 1 重要价值用户 4 24662 4.0 685.0 31.0 2 2 4 0 0 1 重要挽留用户 ... ... ... ... ... ... ... ... ... ... ... ... 426420 2238388 2.0 226.0 119.0 2 1 2 0 0 0 一般挽留用户 426421 2238389 4.0 461.0 0.0 5 2 3 1 0 1 重要发展用户 426422 2238396 5.0 193.0 44.0 2 2 1 0 0 0 一般挽留用户 426423 2238397 1.0 258.0 87.0 2 1 2 0 0 0 一般挽留用户 426424 2238403 3.0 256.0 52.0 2 1 2 0 0 0 一般挽留用户
426425 rows × 11 columns
# Count the rows in each RFM segment; `sampleid` holds the per-segment count.
segment_groups = rfm.groupby('RFM', as_index=False)
rfm_new = segment_groups['sampleid'].count()
rfm_new
RFM sampleid 0 一般价值用户 78592 1 一般保持用户 46850 2 一般发展用户 42275 3 一般挽留用户 83394 4 重要价值用户 63595 5 重要保持用户 38850 6 重要发展用户 20235 7 重要挽留用户 52634
# Pie chart of each segment's share of all rows.
plt.figure(figsize=(12, 6))
segment_counts = rfm_new['sampleid']
segment_share = (segment_counts / segment_counts.sum()).to_list()
segment_names = rfm_new['RFM'].to_list()
plt.pie(segment_share, labels=segment_names, autopct='%0.2f%%')
[Text(0.9207056795449674, 0.6019144886556893, '一般价值用户'),
Text(0.07432600254562635, 1.0974860570164833, '一般保持用户'),
Text(-0.6110720650087508, 0.9146534487804336, '一般发展用户'),
Text(-1.0982775444537256, 0.061534017816936265, '一般挽留用户'),
Text(-0.5691837587192285, -0.9412915854347425, '重要价值用户'),
Text(0.23025758705583027, -1.0756307189752563, '重要保持用户'),
Text(0.6623554620643879, -0.8782284679247601, '重要发展用户'),
Text(1.0183302486279413, -0.4159368999371846, '重要挽留用户')],
[Text(0.5022030979336185, 0.3283169938121941, '18.43%'),
Text(0.04054145593397801, 0.5986287583726272, '10.99%'),
Text(-0.33331203545931853, 0.49890188115296374, '9.91%'),
Text(-0.5990604787929412, 0.03356400971832887, '19.56%'),
Text(-0.31046386839230644, -0.5134317738734958, '14.91%'),
Text(0.1255950474849983, -0.5867076648955943, '9.11%'),
Text(0.3612847974896661, -0.47903370977714177, '4.75%'),
Text(0.5554528628879679, -0.22687467269300973, '12.34%')])