List of usage examples for org.apache.commons.lang.ArrayUtils.indexOf
public static int indexOf(boolean[] array, boolean valueToFind)
Finds the index of the first occurrence of the given value in the array. Returns -1 (ArrayUtils.INDEX_NOT_FOUND) if the value is not found or the input array is null. Overloads exist for Object[] and for every primitive array type.
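Before the Mahout excerpts below, a minimal self-contained sketch of the method's behavior. This is an illustrative example, not taken from the library's documentation; it assumes commons-lang 2.x on the classpath, and the array contents are made up:

import org.apache.commons.lang.ArrayUtils;

public class IndexOfDemo {
    public static void main(String[] args) {
        // primitive overload: index of the first element equal to the value
        boolean[] flags = { false, false, true, false };
        System.out.println(ArrayUtils.indexOf(flags, true)); // 2

        // Object overload: elements are compared with equals()
        String[] tokens = { "red", "green", "blue" };
        System.out.println(ArrayUtils.indexOf(tokens, "green")); // 1

        // a missing value (or a null array) yields ArrayUtils.INDEX_NOT_FOUND (-1)
        System.out.println(ArrayUtils.indexOf(tokens, "cyan")); // -1
    }
}

Because the Object[] overload compares with equals(), the examples below can look up String tokens and Attribute enum constants directly.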
From source file: org.apache.mahout.df.data.Dataset.java

/**
 * Converts a token to its corresponding int code for a given attribute
 *
 * @param attr attribute's index
 * @param token the token to look up
 * @return the token's int code, or -1 if the token is not found
 */
public int valueOf(int attr, String token) {
    if (isNumerical(attr)) {
        throw new IllegalArgumentException("Only for CATEGORICAL attributes");
    }
    if (values == null) {
        throw new IllegalStateException("Values not found");
    }

    return ArrayUtils.indexOf(values[attr], token);
}
From source file: org.apache.mahout.df.data.Utils.java

/**
 * Finds the label attribute's index
 *
 * @param descriptor the dataset descriptor
 * @return the index of the label attribute, or -1 if no label is found
 * @throws DescriptorException if the descriptor cannot be parsed
 */
public static int findLabel(String descriptor) throws DescriptorException {
    Attribute[] attrs = DescriptorUtils.parseDescriptor(descriptor);
    return ArrayUtils.indexOf(attrs, Attribute.LABEL);
}
From source file: org.apache.mahout.df.mapred.partial.PartialSequentialBuilder.java

@Override
protected void runJob(JobConf job) throws IOException {
    // retrieve the splits
    TextInputFormat input = (TextInputFormat) job.getInputFormat();
    InputSplit[] splits = input.getSplits(job, job.getNumMapTasks());
    log.debug("Nb splits : {}", splits.length);

    InputSplit[] sorted = Arrays.copyOf(splits, splits.length);
    Builder.sortSplits(sorted);

    int numTrees = Builder.getNbTrees(job); // total number of trees

    firstOutput = new PartialOutputCollector(numTrees);
    Reporter reporter = Reporter.NULL;

    firstIds = new int[splits.length];
    sizes = new int[splits.length];

    // to compute firstIds, process the splits in file order
    int firstId = 0;
    long slowest = 0; // duration of slowest map
    for (InputSplit split : splits) {
        int hp = ArrayUtils.indexOf(sorted, split); // hadoop's partition

        RecordReader<LongWritable, Text> reader = input.getRecordReader(split, job, reporter);

        LongWritable key = reader.createKey();
        Text value = reader.createValue();

        Step1Mapper mapper = new MockStep1Mapper(getTreeBuilder(), dataset, getSeed(), hp, splits.length,
            numTrees);

        long time = System.currentTimeMillis();

        firstIds[hp] = firstId;

        while (reader.next(key, value)) {
            mapper.map(key, value, firstOutput, reporter);
            firstId++;
            sizes[hp]++;
        }

        mapper.close();

        time = System.currentTimeMillis() - time;
        log.info("Duration : {}", DFUtils.elapsedTime(time));

        if (time > slowest) {
            slowest = time;
        }
    }

    log.info("Longest duration : {}", DFUtils.elapsedTime(slowest));
}
From source file: org.apache.mahout.df.mapred.partial.Step0Job.java

/**
 * Replaces the first id for each partition in Hadoop's order
 *
 * @param keys the partition index of each value
 * @param values the outputs of the Step0 mappers
 * @return the values, with firstId replaced, reordered in Hadoop's order
 */
protected static Step0Output[] processOutput(int[] keys, Step0Output[] values) {
    int numMaps = values.length;

    // sort the values using firstId
    Step0Output[] sorted = Arrays.copyOf(values, numMaps);
    Arrays.sort(sorted);

    // compute the partitions' firstIds (file order)
    int[] orderedIds = new int[numMaps];
    orderedIds[0] = 0;
    for (int p = 1; p < numMaps; p++) {
        orderedIds[p] = orderedIds[p - 1] + sorted[p - 1].size;
    }

    // update the values' first ids
    for (int p = 0; p < numMaps; p++) {
        int order = ArrayUtils.indexOf(sorted, values[p]);
        values[p].firstId = orderedIds[order];
    }

    // reorder the values in Hadoop's order
    Step0Output[] reordered = new Step0Output[numMaps];
    for (int p = 0; p < numMaps; p++) {
        reordered[keys[p]] = values[p];
    }

    return reordered;
}
From source file: org.apache.mahout.df.mapred.partial.Step2MapperTest.java

public void testMapper() throws Exception {
    Random rng = RandomUtils.getRandom();

    // prepare the data
    String descriptor = Utils.randomDescriptor(rng, nbAttributes);
    double[][] source = Utils.randomDoubles(rng, descriptor, nbInstances);
    String[] sData = Utils.double2String(source);
    Dataset dataset = DataLoader.generateDataset(descriptor, sData);
    String[][] splits = Utils.splitData(sData, nbMappers);

    // prepare first step output
    TreeID[] keys = new TreeID[nbTrees];
    Node[] trees = new Node[nbTrees];
    int[] sizes = new int[nbMappers];

    int treeIndex = 0;
    for (int partition = 0; partition < nbMappers; partition++) {
        int nbMapTrees = Step1Mapper.nbTrees(nbMappers, nbTrees, partition);

        for (int tree = 0; tree < nbMapTrees; tree++, treeIndex++) {
            keys[treeIndex] = new TreeID(partition, treeIndex);
            // put the partition in the leaf's label
            // this way we can track the outputs
            trees[treeIndex] = new Leaf(partition);
        }

        sizes[partition] = splits[partition].length;
    }

    // store the first step outputs in a file
    FileSystem fs = FileSystem.getLocal(new Configuration());
    Path forestPath = new Path("testdata/Step2MapperTest.forest");
    InterResults.store(fs, forestPath, keys, trees, sizes);

    LongWritable key = new LongWritable();
    Text value = new Text();

    for (int partition = 0; partition < nbMappers; partition++) {
        String[] split = splits[partition];

        // number of trees that will be handled by the mapper
        int nbConcerned = Step2Mapper.nbConcerned(nbMappers, nbTrees, partition);

        PartialOutputCollector output = new PartialOutputCollector(nbConcerned);

        // load the current mapper's (key, tree) pairs
        TreeID[] curKeys = new TreeID[nbConcerned];
        Node[] curTrees = new Node[nbConcerned];
        InterResults.load(fs, forestPath, nbMappers, nbTrees, partition, curKeys, curTrees);

        // simulate the job
        MockStep2Mapper mapper = new MockStep2Mapper(partition, dataset, curKeys, curTrees, split.length);

        for (int index = 0; index < split.length; index++) {
            key.set(index);
            value.set(split[index]);
            mapper.map(key, value, output, Reporter.NULL);
        }

        mapper.close();

        // make sure the mapper did not return its own trees
        assertEquals(nbConcerned, output.nbOutputs());

        // check the returned results
        int current = 0;
        for (int index = 0; index < nbTrees; index++) {
            if (keys[index].partition() == partition) {
                // should not be part of the results
                continue;
            }

            TreeID k = output.getKeys()[current];

            // the tree should receive the partition's index
            assertEquals(partition, k.partition());

            // make sure all the trees of the other partitions are handled in the
            // correct order
            assertEquals(index, k.treeId());

            int[] predictions = output.getValues()[current].getPredictions();

            // all the instances of the partition should be classified
            assertEquals(split.length, predictions.length);
            assertEquals("at least one instance of the partition was not classified", -1,
                ArrayUtils.indexOf(predictions, -1));

            // the tree must not belong to the mapper's partition
            int treePartition = predictions[0];
            assertFalse("Step2Mapper returned a tree from its own partition", partition == treePartition);

            current++;
        }
    }
}
From source file: org.apache.mahout.df.mapreduce.partial.PartialSequentialBuilder.java

@Override
protected boolean runJob(Job job) throws IOException, InterruptedException {
    Configuration conf = job.getConfiguration();

    // retrieve the splits
    TextInputFormat input = new TextInputFormat();
    List<InputSplit> splits = input.getSplits(job);

    int nbSplits = splits.size();
    log.debug("Nb splits : {}", nbSplits);

    InputSplit[] sorted = new InputSplit[nbSplits];
    splits.toArray(sorted);
    Builder.sortSplits(sorted);

    int numTrees = Builder.getNbTrees(conf); // total number of trees

    TaskAttemptContext task = new TaskAttemptContext(conf, new TaskAttemptID());

    firstOutput = new MockContext(new Step1Mapper(), conf, task.getTaskAttemptID(), numTrees);

    firstIds = new int[nbSplits];
    sizes = new int[nbSplits];

    // to compute firstIds, process the splits in file order
    long slowest = 0; // duration of slowest map
    int firstId = 0;
    for (int p = 0; p < nbSplits; p++) {
        InputSplit split = splits.get(p);
        int hp = ArrayUtils.indexOf(sorted, split); // hadoop's partition

        RecordReader<LongWritable, Text> reader = input.createRecordReader(split, task);
        reader.initialize(split, task);

        Step1Mapper mapper = new MockStep1Mapper(getTreeBuilder(), dataset, getSeed(), hp, nbSplits, numTrees);

        long time = System.currentTimeMillis();

        firstIds[hp] = firstId;

        while (reader.nextKeyValue()) {
            mapper.map(reader.getCurrentKey(), reader.getCurrentValue(), firstOutput);
            firstId++;
            sizes[hp]++;
        }

        mapper.cleanup(firstOutput);

        time = System.currentTimeMillis() - time;
        log.info("Duration : {}", DFUtils.elapsedTime(time));

        if (time > slowest) {
            slowest = time;
        }
    }

    log.info("Longest duration : {}", DFUtils.elapsedTime(slowest));
    return true;
}
From source file: org.apache.mahout.df.mapreduce.partial.Step0Job.java

/**
 * Replaces the first id for each partition in Hadoop's order
 *
 * @param keys the partition index of each value
 * @param values the outputs of the Step0 mappers
 * @return the values, with firstId replaced, reordered in Hadoop's order
 */
protected static Step0Output[] processOutput(List<Integer> keys, List<Step0Output> values) {
    int numMaps = values.size();

    // sort the values using firstId
    Step0Output[] sorted = new Step0Output[numMaps];
    values.toArray(sorted);
    Arrays.sort(sorted);

    // compute the partitions' firstIds (file order)
    int[] orderedIds = new int[numMaps];
    orderedIds[0] = 0;
    for (int p = 1; p < numMaps; p++) {
        orderedIds[p] = orderedIds[p - 1] + sorted[p - 1].size;
    }

    // update the values' first ids
    for (int p = 0; p < numMaps; p++) {
        int order = ArrayUtils.indexOf(sorted, values.get(p));
        values.get(p).firstId = orderedIds[order];
    }

    // reorder the values in Hadoop's order
    Step0Output[] reordered = new Step0Output[numMaps];
    for (int p = 0; p < numMaps; p++) {
        reordered[keys.get(p)] = values.get(p);
    }

    return reordered;
}
From source file: org.apache.mahout.df.mapreduce.partial.Step2MapperTest.java

public void testMapper() throws Exception {
    Random rng = RandomUtils.getRandom();

    // prepare the data
    String descriptor = Utils.randomDescriptor(rng, nbAttributes);
    double[][] source = Utils.randomDoubles(rng, descriptor, nbInstances);
    String[] sData = Utils.double2String(source);
    Dataset dataset = DataLoader.generateDataset(descriptor, sData);
    String[][] splits = Utils.splitData(sData, nbMappers);

    // prepare first step output
    TreeID[] keys = new TreeID[nbTrees];
    Node[] trees = new Node[nbTrees];
    int[] sizes = new int[nbMappers];

    int treeIndex = 0;
    for (int partition = 0; partition < nbMappers; partition++) {
        int nbMapTrees = Step1Mapper.nbTrees(nbMappers, nbTrees, partition);

        for (int tree = 0; tree < nbMapTrees; tree++, treeIndex++) {
            keys[treeIndex] = new TreeID(partition, treeIndex);
            // put the partition in the leaf's label
            // this way we can track the outputs
            trees[treeIndex] = new Leaf(partition);
        }

        sizes[partition] = splits[partition].length;
    }

    // store the first step outputs in a file
    FileSystem fs = FileSystem.getLocal(new Configuration());
    Path forestPath = new Path("testdata/Step2MapperTest.forest");
    InterResults.store(fs, forestPath, keys, trees, sizes);

    LongWritable key = new LongWritable();
    Text value = new Text();

    for (int partition = 0; partition < nbMappers; partition++) {
        String[] split = splits[partition];

        // number of trees that will be handled by the mapper
        int nbConcerned = Step2Mapper.nbConcerned(nbMappers, nbTrees, partition);

        MockContext context = new MockContext(new Step2Mapper(), new Configuration(), new TaskAttemptID(),
            nbConcerned);

        // load the current mapper's (key, tree) pairs
        TreeID[] curKeys = new TreeID[nbConcerned];
        Node[] curTrees = new Node[nbConcerned];
        InterResults.load(fs, forestPath, nbMappers, nbTrees, partition, curKeys, curTrees);

        // simulate the job
        MockStep2Mapper mapper = new MockStep2Mapper(partition, dataset, curKeys, curTrees, split.length);

        for (int index = 0; index < split.length; index++) {
            key.set(index);
            value.set(split[index]);
            mapper.map(key, value, context);
        }

        mapper.cleanup(context);

        // make sure the mapper did not return its own trees
        assertEquals(nbConcerned, context.nbOutputs());

        // check the returned results
        int current = 0;
        for (int index = 0; index < nbTrees; index++) {
            if (keys[index].partition() == partition) {
                // should not be part of the results
                continue;
            }

            TreeID k = context.getKeys()[current];

            // the tree should receive the partition's index
            assertEquals(partition, k.partition());

            // make sure all the trees of the other partitions are handled in the
            // correct order
            assertEquals(index, k.treeId());

            int[] predictions = context.getValues()[current].getPredictions();

            // all the instances of the partition should be classified
            assertEquals(split.length, predictions.length);
            assertEquals("at least one instance of the partition was not classified", -1,
                ArrayUtils.indexOf(predictions, -1));

            // the tree must not belong to the mapper's partition
            int treePartition = predictions[0];
            assertFalse("Step2Mapper returned a tree from its own partition", partition == treePartition);

            current++;
        }
    }
}
From source file: org.apache.mahout.df.node.CategoricalNode.java

@Override
public int classify(Instance instance) {
    int index = ArrayUtils.indexOf(values, instance.get(attr));
    if (index == -1) {
        // value not available, we cannot predict
        return -1;
    }
    return childs[index].classify(instance);
}
From source file: org.apache.mahout.df.split.OptIgSplit.java

/**
 * Computes the split for a CATEGORICAL attribute
 *
 * @param data the data to split
 * @param attr the attribute's index
 * @return the computed split, with its information gain
 */
private static Split categoricalSplit(Data data, int attr) {
    double[] values = data.values(attr);
    int[][] counts = new int[values.length][data.getDataset().nblabels()];
    int[] countAll = new int[data.getDataset().nblabels()];

    // compute frequencies
    for (int index = 0; index < data.size(); index++) {
        Instance instance = data.get(index);
        counts[ArrayUtils.indexOf(values, instance.get(attr))][instance.label]++;
        countAll[instance.label]++;
    }

    int size = data.size();
    double hy = entropy(countAll, size); // H(Y)
    double hyx = 0.0; // H(Y|X)
    double invDataSize = 1.0 / size;

    for (int index = 0; index < values.length; index++) {
        size = DataUtils.sum(counts[index]);
        hyx += size * invDataSize * entropy(counts[index], size);
    }

    double ig = hy - hyx;
    return new Split(attr, ig);
}
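The snippet above looks values up with indexOf(double[], double), which compares doubles exactly; that is safe here because the sought value comes from the same fixed dictionary as the array. When the sought value is computed, the tolerance overload that commons-lang also provides is the safer choice. A small illustrative sketch (the array contents are made up):

import org.apache.commons.lang.ArrayUtils;

public class ToleranceDemo {
    public static void main(String[] args) {
        double[] values = { 0.1, 0.2, 0.3 };

        // exact comparison: 0.1 + 0.2 is 0.30000000000000004 in IEEE 754, so no match
        System.out.println(ArrayUtils.indexOf(values, 0.1 + 0.2)); // -1

        // tolerance overload: matches the first element within +/- 1e-9 of the value
        System.out.println(ArrayUtils.indexOf(values, 0.1 + 0.2, 1e-9)); // 2
    }
}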