IBM Dublin Research Lab
# Set up worker processes for parallel model evaluation.
# FIX: `nprocs`, `addprocs`, and `@everywhere` live in the Distributed
# stdlib on Julia >= 0.7, so it must be loaded explicitly.
using Distributed
nprocs() == 1 && addprocs()
@everywhere import CombineML.Util
@everywhere import CombineML.Transformers
@everywhere import RDatasets
# Short module aliases, defined on every worker.
@everywhere CU = CombineML.Util
@everywhere CT = CombineML.Transformers
@everywhere RD = RDatasets
# Scikit-learn wrapper that provides access to scikit learners.
# BUG FIX: the learner name was wrapped in typographic quotes (“ ”), which
# is invalid Julia string syntax; replaced with ASCII double quotes.
@everywhere sk_gblearner = CT.SKLLearner(
    Dict(
        :output => :class,
        :learner => "GradientBoostingClassifier",
        :impl_options => Dict()
    )
)
# Meta-learner that evaluates each candidate and keeps the best-scoring one.
@everywhere best_learner = CT.BestLearner(Dict(
    :output => :class,
    :score_type => Real,
    :learner_options_grid => nothing,
    :learners => [sk_gblearner, CT.PrunedTree(), CT.RandomForest()],
))
# Majority-vote ensemble over five base learners.
@everywhere vote_learner = CT.VoteEnsemble(Dict(
    :output => :class,
    :learners => [
        CT.PrunedTree(),
        CT.DecisionStumpAdaboost(),
        CT.RandomForest(),
        sk_gblearner,
        best_learner,
    ],
))
# Stacked ensemble: a RandomForest stacker learns to combine the
# predictions of the five base learners.
@everywhere stack_learner = CT.StackEnsemble(Dict(
    :stacker => CT.RandomForest(),
    :learners => [
        CT.PrunedTree(),
        CT.RandomForest(),
        CT.DecisionStumpAdaboost(),
        best_learner,
        sk_gblearner,
    ],
))
# --- Top-level demo: preprocessing + learning pipeline on the iris data. ---
# BUG FIX: the original referenced an undefined variable `learner`; bind it
# explicitly before building the pipeline.
learner = vote_learner  # NOTE(review): choose whichever learner to demo here
pipeline = CT.Pipeline(
    Dict(
        :transformers => [
            CT.OneHotEncoder(),   # Encodes nominal features into numeric
            CT.Imputer(),         # Imputes NA values
            CT.StandardScaler(),  # Standardizes features
            learner               # Final prediction stage
        ]
    )
)
# BUG FIX: typographic quotes replaced with ASCII quotes in string literals.
dataset = RD.dataset("datasets", "iris")
instances = Array(dataset[:, 1:(end - 1)])  # feature columns
labels = Array(dataset[:, end])             # class column (last)
# Hold out 30% of the rows for testing.
(train_ind, test_ind) = CU.holdout(size(instances, 1), 0.3)
CT.fit!(pipeline, instances[train_ind, :], labels[train_ind])
predictions = CT.transform!(pipeline, instances[test_ind, :])
result = CU.score(:accuracy, labels[test_ind], predictions)
# Train and evaluate `learner` on the iris dataset (runs on any worker).
# Returns the holdout accuracy (a percentage) for one random 70/30 split.
# BUG FIX: typographic quotes replaced with ASCII quotes in string literals.
@everywhere function predict(learner)
    dataset = RD.dataset("datasets", "iris")
    instances = Array(dataset[:, 1:(end - 1)])  # feature columns
    labels = Array(dataset[:, end])             # class column (last)
    # Hold out 30% of the rows for testing.
    (train_ind, test_ind) = CU.holdout(size(instances, 1), 0.3)
    pipeline = CT.Pipeline(
        Dict(
            :transformers => [
                CT.OneHotEncoder(),   # Encodes nominal features into numeric
                CT.Imputer(),         # Imputes NA values
                CT.StandardScaler(),  # Standardizes features
                CT.PCA(),             # Reduces dimensionality
                learner               # Final prediction stage
            ]
        )
    )
    CT.fit!(pipeline, instances[train_ind, :], labels[train_ind])
    predictions = CT.transform!(pipeline, instances[test_ind, :])
    result = CU.score(:accuracy, labels[test_ind], predictions)
    return result
end
using DataFrames

# Evaluate every learner `trials` times in parallel and return a DataFrame
# with one row per model: mean accuracy, std of accuracy, and trial count,
# sorted by mean accuracy (best first).
# BUG FIXES: typographic quotes in the println call, and the broken pipe
# `| > DataFrame |` replaced with the valid pipe operator `|> DataFrame`.
function main(trials)
    learners = Dict(
        :gradientboosting => sk_gblearner,
        :randomforest => CT.RandomForest(),
        :adaboost => CT.DecisionStumpAdaboost(),
        :votelearner => vote_learner,
        :bestlearner => best_learner,
        :stacklearner => stack_learner,
    )
    models = collect(keys(learners))
    # Outer loop over models and inner loop over trials both run in parallel;
    # (vcat) stacks the per-iteration results into one table / vector.
    ctable = @parallel (vcat) for model in models
        acc = @parallel (vcat) for i = 1:trials
            res = predict(learners[model])
            println(model, " => ", round(res))
            res
        end
        [model round(mean(acc)) round(std(acc)) length(acc)]
    end
    # Sort rows by mean accuracy, descending, then convert to a DataFrame.
    sorted = sortrows(ctable, by = (x) -> x[2], rev = true) |> DataFrame
    rename!(sorted, Dict(:x1 => :model, :x2 => :mean_acc,
                         :x3 => :std_acc, :x4 => :trials))
    return sorted
end
# Number of evaluation repetitions per learner.
const trials = 5
# Run the full benchmark and display the resulting comparison table.
res = main(trials)
@show res
Note: the learners have only been tested on instances with numeric features.
Inconsistencies may result from using nominal features directly, without first applying a numeric transformation (i.e., OneHotEncoder).
I am a research scientist at the IBM Dublin Research Lab working in the areas of analytics, datamining, machine learning, and AI. I finished my Doctor of Engineering degree from the Toyohashi University of Technology in Japan (2005). I have a Master’s degree in Computer Science majoring in Artificial Intelligence (Ateneo de Manila University, 1995) and a Bachelor’s degree in Applied Mathematics (University of the Philippines in the Visayas, 1991). I used to work as a technical staff for two years in the Neuroinformatics Lab of RIKEN Brain Science Institute, Japan before finishing my DEng degree. I spent a total of 4 years as a Postdoctoral Fellow in the National University of Singapore and the National Neuroscience Institute working on diverse topics such as context-aware reasoning, datamining models for activity recognition in smarthome environment, detecting biomarkers for Parkinson’s Disease by image processing of fMRI and DTI images, and automated diagnosis of movement disorder for intelligent healthcare. Moreover, I held an Asst. Professorship for a total of 6 years in the University of the Philippines and Ateneo de Manila University. My research interests include datamining, optimization, development of intelligent agents using machine learning and evolutionary computation, neuroinformatics, and biomedical engineering.