List of usage examples for org.apache.hadoop.io Text charAt

public int charAt(int position)

position - the byte position of the character to return. charAt returns the Unicode scalar value at that position as an int, or -1 if the position is invalid or falls on a trailing byte of a multi-byte character.
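A minimal sketch of how charAt is typically used on stripe keys like those in the Mahout examples below (the key value and offsets here are illustrative):

import org.apache.hadoop.io.Text;

public class TextCharAtDemo {
    public static void main(String[] args) {
        Text key = new Text("TRANSIT_3");
        // charAt returns an int (the Unicode scalar value), so it can be
        // compared directly against a char literal.
        if (key.charAt(0) == 'T') {
            // For single-digit state IDs, the digit of "TRANSIT_n" sits at byte offset 8.
            int stateID = Character.getNumericValue(key.charAt(8));
            System.out.println("transition stripe for state " + stateID); // prints 3
        }
    }
}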
From source file:org.apache.mahout.classifier.sequencelearning.baumwelchmapreduce.BaumWelchUtils.java
License:Apache License
public static HmmModel CreateHmmModel(int nrOfHiddenStates, int nrOfOutputStates, Path modelPath,
        Configuration conf) throws IOException {
    log.info("Entering Create Hmm Model. Model Path = {}", modelPath.toUri());
    Vector initialProbabilities = new DenseVector(nrOfHiddenStates);
    Matrix transitionMatrix = new DenseMatrix(nrOfHiddenStates, nrOfHiddenStates);
    Matrix emissionMatrix = new DenseMatrix(nrOfHiddenStates, nrOfOutputStates);
    // Get the path location where the seq files encoding the model are stored
    Path modelFilesPath = new Path(modelPath, "*");
    log.info("Create Hmm Model. ModelFiles Path = {}", modelFilesPath.toUri());
    Collection<Path> result = new ArrayList<Path>();
    // get all filtered file names in the result list
    FileSystem fs = modelFilesPath.getFileSystem(conf);
    log.info("Create Hmm Model. File System = {}", fs);
    FileStatus[] matches = fs.listStatus(
            FileUtil.stat2Paths(fs.globStatus(modelFilesPath, PathFilters.partFilter())),
            PathFilters.partFilter());
    for (FileStatus match : matches) {
        log.info("CreateHmmModel Adding File Match {}", match.getPath().toString());
        result.add(fs.makeQualified(match.getPath()));
    }
    // iterate through the result path list
    for (Path path : result) {
        for (Pair<Writable, MapWritable> pair : new SequenceFileIterable<Writable, MapWritable>(path, true, conf)) {
            Text key = (Text) pair.getFirst();
            log.info("CreateHmmModel Matching Seq File Key = {}", key);
            MapWritable valueMap = pair.getSecond();
            if (key.charAt(0) == 'I') {
                // initial distribution stripe
                for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
                    log.info("CreateHmmModel Initial Prob Adding Key, Value = ({} {})",
                            ((IntWritable) entry.getKey()).get(), ((DoubleWritable) entry.getValue()).get());
                    initialProbabilities.set(((IntWritable) entry.getKey()).get(),
                            ((DoubleWritable) entry.getValue()).get());
                }
            } else if (key.charAt(0) == 'T') {
                // transition distribution stripe
                // key is of the form TRANSIT_0, TRANSIT_1 etc;
                // the state ID digit after '_' sits at char index 8
                int stateID = Character.getNumericValue(key.charAt(8));
                log.info("CreateHmmModel stateID = key.charAt(8) = {}", stateID);
                for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
                    log.info("CreateHmmModel Transition Matrix ({}, {}) = {}", new Object[] { stateID,
                            ((IntWritable) entry.getKey()).get(), ((DoubleWritable) entry.getValue()).get() });
                    transitionMatrix.set(stateID, ((IntWritable) entry.getKey()).get(),
                            ((DoubleWritable) entry.getValue()).get());
                }
            } else if (key.charAt(0) == 'E') {
                // emission distribution stripe
                // key is of the form EMIT_0, EMIT_1 etc;
                // the state ID digit after '_' sits at char index 5
                int stateID = Character.getNumericValue(key.charAt(5));
                for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
                    log.info("CreateHmmModel Emission Matrix ({}, {}) = {}", new Object[] { stateID,
                            ((IntWritable) entry.getKey()).get(), ((DoubleWritable) entry.getValue()).get() });
                    emissionMatrix.set(stateID, ((IntWritable) entry.getKey()).get(),
                            ((DoubleWritable) entry.getValue()).get());
                }
            } else {
                throw new IllegalStateException("Error creating HmmModel from Sequence File Path");
            }
        }
    }
    HmmModel model = new HmmModel(transitionMatrix, emissionMatrix, initialProbabilities);
    HmmUtils.validate(model);
    return model;
}
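Note that Character.getNumericValue(key.charAt(8)) recovers the state ID only while it is a single digit; the BaumWelchUtils variant further down instead parses the full suffix with Integer.parseInt(key.toString().split("_")[1]), which also handles state IDs of ten and above.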
From source file:org.apache.mahout.classifier.sequencelearning.hmm.hadoop.BaumWelchCombiner.java
License:Apache License
@Override
protected void reduce(Text key, Iterable<MapWritable> stripes, Context context)
        throws IOException, InterruptedException {
    MapWritable sumOfStripes = new MapWritable();
    if (scaling.equals("logscaling")) {
        for (MapWritable stripe : stripes) {
            for (Map.Entry e : stripe.entrySet()) {
                double val = ((DoubleWritable) e.getValue()).get();
                if (!sumOfStripes.containsKey(e.getKey())) {
                    sumOfStripes.put((IntWritable) e.getKey(), new DoubleWritable(val));
                } else {
                    double sumStripesVal = ((DoubleWritable) sumOfStripes.get(e.getKey())).get();
                    if (sumStripesVal > Double.NEGATIVE_INFINITY) {
                        val = val + Math.log(1 + Math.exp(sumStripesVal - val));
                    }
                    sumOfStripes.put((IntWritable) e.getKey(), new DoubleWritable(val));
                }
            }
        }
    } else if (scaling.equals("rescaling")) {
        for (MapWritable stripe : stripes) {
            for (Map.Entry e : stripe.entrySet()) {
                if (key.charAt(0) == (int) 'I') {
                    double val = ((DoubleWritable) e.getValue()).get();
                    if (!sumOfStripes.containsKey(e.getKey())) {
                        sumOfStripes.put((IntWritable) e.getKey(), (DoubleWritable) e.getValue());
                    } else {
                        val += ((DoubleWritable) sumOfStripes.get(e.getKey())).get();
                        sumOfStripes.put((IntWritable) e.getKey(), new DoubleWritable(val));
                    }
                } else {
                    double[] pr = BaumWelchUtils.toDoublePair(((BytesWritable) e.getValue()).getBytes());
                    double num = pr[0];
                    double denom = pr[1];
                    if (!sumOfStripes.containsKey(e.getKey())) {
                        sumOfStripes.put((IntWritable) e.getKey(), (BytesWritable) e.getValue());
                    } else {
                        double[] pr1 = BaumWelchUtils
                                .toDoublePair(((BytesWritable) sumOfStripes.get(e.getKey())).getBytes());
                        num += pr1[0];
                        denom += pr1[1];
                        byte[] doublePair1 = BaumWelchUtils.doublePairToByteArray(num, denom);
                        sumOfStripes.put((IntWritable) e.getKey(), new BytesWritable(doublePair1));
                    }
                }
            }
        }
    } else {
        for (MapWritable stripe : stripes) {
            for (Map.Entry e : stripe.entrySet()) {
                double val = ((DoubleWritable) e.getValue()).get();
                if (!sumOfStripes.containsKey(e.getKey())) {
                    sumOfStripes.put((IntWritable) e.getKey(), (DoubleWritable) e.getValue());
                } else {
                    val += ((DoubleWritable) sumOfStripes.get(e.getKey())).get();
                    sumOfStripes.put((IntWritable) e.getKey(), new DoubleWritable(val));
                }
            }
        }
    }
    context.write(key, sumOfStripes);
}
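The rescaling branch above carries a (numerator, denominator) pair per matrix cell through BaumWelchUtils.toDoublePair and doublePairToByteArray. A plausible minimal sketch of that encoding, assuming a straightforward 16-byte layout (the actual Mahout helpers may differ):

import java.nio.ByteBuffer;

// Hypothetical re-implementation for illustration only: packs two doubles
// into 16 bytes so the pair can travel inside a BytesWritable.
static byte[] doublePairToByteArray(double num, double denom) {
    return ByteBuffer.allocate(16).putDouble(num).putDouble(denom).array();
}

// Inverse of the above: reads the pair back out of the raw bytes.
static double[] toDoublePair(byte[] bytes) {
    ByteBuffer buf = ByteBuffer.wrap(bytes);
    return new double[] { buf.getDouble(), buf.getDouble() };
}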
From source file:org.apache.mahout.classifier.sequencelearning.hmm.hadoop.BaumWelchReducer.java
License:Apache License
@Override
protected void reduce(Text key, Iterable<MapWritable> stripes, Context context)
        throws IOException, InterruptedException {
    MapWritable sumOfStripes = new MapWritable();
    // Finish the Expectation step by aggregating all posterior probabilities for one key
    if (scaling.equals("logscaling")) {
        double totalValSum = Double.NEGATIVE_INFINITY;
        for (MapWritable stripe : stripes) {
            for (Map.Entry e : stripe.entrySet()) {
                double val = ((DoubleWritable) e.getValue()).get();
                double max = totalValSum > val ? totalValSum : val;
                totalValSum = max + Math.log(Math.exp(totalValSum - max) + Math.exp(val - max));
                if (!sumOfStripes.containsKey(e.getKey())) {
                    sumOfStripes.put((IntWritable) e.getKey(), new DoubleWritable(val));
                } else {
                    double sumStripesVal = ((DoubleWritable) sumOfStripes.get(e.getKey())).get();
                    if (sumStripesVal > Double.NEGATIVE_INFINITY) {
                        val = val + Math.log(1 + Math.exp(sumStripesVal - val));
                    }
                    sumOfStripes.put((IntWritable) e.getKey(), new DoubleWritable(val));
                }
            }
        }
        // normalize the aggregate
        for (Map.Entry e : sumOfStripes.entrySet()) {
            double val = ((DoubleWritable) e.getValue()).get();
            if (totalValSum > Double.NEGATIVE_INFINITY) {
                val = val - totalValSum;
            }
            sumOfStripes.put((IntWritable) e.getKey(), new DoubleWritable(Math.exp(val)));
        }
    } else if (scaling.equals("rescaling")) {
        double totalValSum = 0.0;
        for (MapWritable stripe : stripes) {
            for (Map.Entry e : stripe.entrySet()) {
                if (key.charAt(0) == (int) 'I') {
                    double val = ((DoubleWritable) e.getValue()).get();
                    totalValSum += val;
                    if (!sumOfStripes.containsKey(e.getKey())) {
                        sumOfStripes.put((IntWritable) e.getKey(), (DoubleWritable) e.getValue());
                    } else {
                        val += ((DoubleWritable) sumOfStripes.get(e.getKey())).get();
                        sumOfStripes.put((IntWritable) e.getKey(), new DoubleWritable(val));
                    }
                } else {
                    double[] pr = BaumWelchUtils.toDoublePair(((BytesWritable) e.getValue()).getBytes());
                    double num = pr[0];
                    double denom = pr[1];
                    if (!sumOfStripes.containsKey(e.getKey())) {
                        sumOfStripes.put((IntWritable) e.getKey(), (BytesWritable) e.getValue());
                    } else {
                        double[] pr1 = BaumWelchUtils
                                .toDoublePair(((BytesWritable) sumOfStripes.get(e.getKey())).getBytes());
                        num += pr1[0];
                        denom += pr1[1];
                        byte[] doublePair1 = BaumWelchUtils.doublePairToByteArray(num, denom);
                        sumOfStripes.put((IntWritable) e.getKey(), new BytesWritable(doublePair1));
                    }
                }
            }
        }
        if (key.charAt(0) == (int) 'I') {
            // normalize the aggregate
            for (Map.Entry e : sumOfStripes.entrySet()) {
                double val = ((DoubleWritable) e.getValue()).get();
                if (totalValSum > 0) {
                    val /= totalValSum;
                }
                sumOfStripes.put((IntWritable) e.getKey(), new DoubleWritable(val));
            }
        } else {
            // compute the probabilities
            for (Map.Entry e : sumOfStripes.entrySet()) {
                double[] pr1 = BaumWelchUtils
                        .toDoublePair(((BytesWritable) sumOfStripes.get(e.getKey())).getBytes());
                sumOfStripes.put((IntWritable) e.getKey(), new DoubleWritable(pr1[0] / pr1[1]));
            }
        }
    } else {
        double totalValSum = 0.0;
        for (MapWritable stripe : stripes) {
            for (Map.Entry e : stripe.entrySet()) {
                double val = ((DoubleWritable) e.getValue()).get();
                totalValSum += val;
                if (!sumOfStripes.containsKey(e.getKey())) {
                    sumOfStripes.put((IntWritable) e.getKey(), (DoubleWritable) e.getValue());
                } else {
                    val += ((DoubleWritable) sumOfStripes.get(e.getKey())).get();
                    sumOfStripes.put((IntWritable) e.getKey(), new DoubleWritable(val));
                }
            }
        }
        // normalize the aggregate
        for (Map.Entry e : sumOfStripes.entrySet()) {
            double val = ((DoubleWritable) e.getValue()).get();
            if (totalValSum > 0) {
                val /= totalValSum;
            }
            sumOfStripes.put((IntWritable) e.getKey(), new DoubleWritable(val));
        }
    }
    // Write the distribution parameter vector to HDFS for the next iteration
    context.write(key, sumOfStripes);
}
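Both the combiner and this reducer accumulate log-probabilities with the log-sum-exp trick; the reducer's running totalValSum update above is exactly this pattern. A minimal standalone sketch of the operation (the helper name logSum is illustrative, not from the source):

// Returns log(exp(a) + exp(b)) without overflow or underflow, by factoring
// out the larger operand first; this mirrors the totalValSum update above.
static double logSum(double a, double b) {
    double max = Math.max(a, b);
    if (max == Double.NEGATIVE_INFINITY) {
        return Double.NEGATIVE_INFINITY; // both inputs represent probability 0
    }
    return max + Math.log(Math.exp(a - max) + Math.exp(b - max));
}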
From source file:org.apache.mahout.classifier.sequencelearning.hmm.hadoop.BaumWelchUtils.java
License:Apache License
/**
 * Converts the sequence files present in a directory to a {@link HmmModel} model.
 *
 * @param nrOfHiddenStates Number of hidden states
 * @param nrOfOutputStates Number of output states
 * @param modelPath        Location of the sequence files containing the model's distributions
 * @param conf             Configuration object
 * @return HmmModel the encoded model
 * @throws IOException
 */
public static HmmModel createHmmModel(int nrOfHiddenStates, int nrOfOutputStates, Path modelPath,
        Configuration conf) throws IOException {
    log.info("Entering Create Hmm Model. Model Path = {}", modelPath.toUri());
    Vector initialProbabilities = new DenseVector(nrOfHiddenStates);
    Matrix transitionMatrix = new DenseMatrix(nrOfHiddenStates, nrOfHiddenStates);
    Matrix emissionMatrix = new DenseMatrix(nrOfHiddenStates, nrOfOutputStates);
    // Get the path location where the seq files encoding the model are stored
    Path modelFilesPath = new Path(modelPath, "*");
    Collection<Path> result = new ArrayList<Path>();
    // get all filtered file names in the result list
    FileSystem fs = modelFilesPath.getFileSystem(conf);
    FileStatus[] matches = fs.listStatus(
            FileUtil.stat2Paths(fs.globStatus(modelFilesPath, PathFilters.partFilter())),
            PathFilters.partFilter());
    for (FileStatus match : matches) {
        result.add(fs.makeQualified(match.getPath()));
    }
    // iterate through the result path list
    for (Path path : result) {
        for (Pair<Writable, MapWritable> pair : new SequenceFileIterable<Writable, MapWritable>(path, true, conf)) {
            Text key = (Text) pair.getFirst();
            MapWritable valueMap = pair.getSecond();
            if (key.charAt(0) == (int) 'I') {
                // initial distribution stripe
                for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
                    initialProbabilities.set(((IntWritable) entry.getKey()).get(),
                            ((DoubleWritable) entry.getValue()).get());
                }
            } else if (key.charAt(0) == (int) 'T') {
                // transition distribution stripe; key is of the form TRANSIT_0, TRANSIT_1 etc
                int stateID = Integer.parseInt(key.toString().split("_")[1]);
                for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
                    transitionMatrix.set(stateID, ((IntWritable) entry.getKey()).get(),
                            ((DoubleWritable) entry.getValue()).get());
                }
            } else if (key.charAt(0) == (int) 'E') {
                // emission distribution stripe; key is of the form EMIT_0, EMIT_1 etc
                int stateID = Integer.parseInt(key.toString().split("_")[1]);
                for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
                    emissionMatrix.set(stateID, ((IntWritable) entry.getKey()).get(),
                            ((DoubleWritable) entry.getValue()).get());
                }
            } else {
                throw new IllegalStateException("Error creating HmmModel from Sequence File Path");
            }
        }
    }
    HmmModel model = new HmmModel(transitionMatrix, emissionMatrix, initialProbabilities);
    // the constructor never yields null, so the dead null-check from the
    // original source is dropped and the model is returned directly
    return model;
}
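A hedged usage sketch of the decoder above; the path and state counts are illustrative values, not from the source:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.mahout.classifier.sequencelearning.hmm.HmmModel;
import org.apache.mahout.classifier.sequencelearning.hmm.hadoop.BaumWelchUtils;

public class LoadHmmModelExample {
    public static void main(String[] args) throws Exception {
        // Illustrative values: 3 hidden states, 2 output states, a made-up path.
        Configuration conf = new Configuration();
        Path modelPath = new Path("/tmp/hmm/model-iteration-1");
        // Rebuild the HmmModel emitted by one Baum-Welch iteration.
        HmmModel model = BaumWelchUtils.createHmmModel(3, 2, modelPath, conf);
        System.out.println("model loaded: " + model);
    }
}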