Java tutorial
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.
 * See the NOTICE file distributed with this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0 (the "License"); you may not use
 * this file except in compliance with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License is
 * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and limitations under the License.
 */

package eu.amidst.huginlink.examples.learning;

import com.google.common.base.Stopwatch;
import eu.amidst.core.datastream.DataInstance;
import eu.amidst.core.datastream.DataStream;
import eu.amidst.core.io.BayesianNetworkLoader;
import eu.amidst.core.learning.parametric.ParallelMaximumLikelihood;
import eu.amidst.core.models.BayesianNetwork;
import eu.amidst.core.utils.BayesianNetworkSampler;
import eu.amidst.huginlink.learning.ParallelTAN;

import java.util.ArrayList;
import java.util.Arrays;

/**
 * This example shows how to use <a href="https://www.hugin.com">Hugin</a>'s functionality to learn a TAN model in parallel.
 * Note that Hugin can only learn a TAN model from a data set that is completely loaded into RAM.
 * When the data set does not fit into memory, AMIDST proceeds as follows: the structure is learnt from a smaller
 * data set produced by <a href="https://en.wikipedia.org/wiki/Reservoir_sampling">Reservoir sampling</a>, and then
 * AMIDST's {@link ParallelMaximumLikelihood} is used to learn the parameters of the TAN model over the whole data set.
 *
 * <p> For further details about the implementation of the parallel TAN algorithm see the following paper: </p>
 *
 * <i> Madsen, A.L. et al. A New Method for Vertical Parallelisation of TAN Learning Based on Balanced Incomplete
 * Block Designs. Probabilistic Graphical Models. Lecture Notes in Computer Science Volume 8754, 2014, pp 302-317. </i>
 */
public class ParallelTANExample {

    public static void main(String[] args) throws Exception {

        //We load a Bayesian network to generate a data stream
        //using the BayesianNetworkSampler class.
        int sampleSize = 100000;
        BayesianNetwork bn = BayesianNetworkLoader.loadFromFile("networks/dataWeka/Pigs.bn");
        BayesianNetworkSampler sampler = new BayesianNetworkSampler(bn);

        //We fix the number of samples kept in memory for performing the structural learning.
        //They are randomly sub-sampled using Reservoir sampling.
        int samplesOnMemory = 5000;

        //We run trials with different numbers of cores
        ArrayList<Integer> vNumCores = new ArrayList<>(Arrays.asList(1, 2, 3, 4));

        for (Integer numCores : vNumCores) {

            System.out.println("Learning TAN: " + samplesOnMemory + " samples on memory, "
                    + numCores + " core/s ...");
            DataStream<DataInstance> data = sampler.sampleToDataStream(sampleSize);

            //The class ParallelTAN is created
            ParallelTAN tan = new ParallelTAN();

            //We activate the parallel mode.
            tan.setParallelMode(true);

            //We set the number of cores to be used for the structural learning
            tan.setNumCores(numCores);

            //We set the number of samples to be used for learning the structure
            tan.setNumSamplesOnMemory(samplesOnMemory);

            //We set the root variable to be the first variable
            tan.setNameRoot(bn.getVariables().getListOfVariables().get(0).getName());

            //We set the class variable to be the last variable
            tan.setNameTarget(bn.getVariables().getListOfVariables()
                    .get(bn.getVariables().getListOfVariables().size() - 1).getName());

            Stopwatch watch = Stopwatch.createStarted();

            //We invoke this method to learn the TAN model from the data stream
            BayesianNetwork model = tan.learn(data);

            System.out.println(watch.stop());
        }
    }
}
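
The Javadoc above describes a two-stage approach for data sets that do not fit into memory: the TAN structure is learnt from a Reservoir sub-sample, and the parameters are then re-estimated over the whole data set with ParallelMaximumLikelihood. In the example, both stages are hidden behind tan.learn(data). The sketch below makes the second stage explicit. It is only a sketch: the class name TANParameterLearningSketch is hypothetical, and the ParallelMaximumLikelihood calls (setParallelMode, setDAG, setDataStream, initLearning, runLearning, getLearntBayesianNetwork) are assumed from other AMIDST parameter-learning examples and may differ between toolbox versions.

package eu.amidst.huginlink.examples.learning;

import eu.amidst.core.datastream.DataInstance;
import eu.amidst.core.datastream.DataStream;
import eu.amidst.core.io.BayesianNetworkLoader;
import eu.amidst.core.learning.parametric.ParallelMaximumLikelihood;
import eu.amidst.core.models.BayesianNetwork;
import eu.amidst.core.utils.BayesianNetworkSampler;
import eu.amidst.huginlink.learning.ParallelTAN;

//Hypothetical example class: a minimal sketch of the two-stage learning scheme
//described in the Javadoc of ParallelTANExample.
public class TANParameterLearningSketch {

    public static void main(String[] args) throws Exception {

        //Sample a data stream from a known network, as in the example above.
        BayesianNetwork bn = BayesianNetworkLoader.loadFromFile("networks/dataWeka/Pigs.bn");
        BayesianNetworkSampler sampler = new BayesianNetworkSampler(bn);
        DataStream<DataInstance> data = sampler.sampleToDataStream(100000);

        //Stage 1: learn the TAN structure from a sub-sample kept in memory (Reservoir sampling).
        ParallelTAN tan = new ParallelTAN();
        tan.setNumSamplesOnMemory(5000);
        tan.setNameRoot(bn.getVariables().getListOfVariables().get(0).getName());
        tan.setNameTarget(bn.getVariables().getListOfVariables()
                .get(bn.getVariables().getListOfVariables().size() - 1).getName());
        BayesianNetwork tanModel = tan.learn(data);

        //Stage 2: re-estimate the parameters over the whole data stream with maximum likelihood.
        //The method names below are assumed from other AMIDST parameter-learning examples.
        ParallelMaximumLikelihood ml = new ParallelMaximumLikelihood();
        ml.setParallelMode(true);
        ml.setDAG(tanModel.getDAG());
        ml.setDataStream(sampler.sampleToDataStream(100000));
        ml.initLearning();
        ml.runLearning();
        BayesianNetwork finalModel = ml.getLearntBayesianNetwork();

        System.out.println(finalModel.toString());
    }
}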