List of usage examples for org.apache.hadoop.io MapWritable entrySet
@Override
public Set<Map.Entry<Writable, Writable>> entrySet()
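Before the full examples, here is a minimal, self-contained sketch of the pattern they all share: entrySet() on a MapWritable yields Map.Entry<Writable, Writable> pairs, so callers cast each key and value back to the concrete Writable types they stored. The class name and the sample keys/values below are illustrative only, not taken from the sources listed afterwards.

import java.util.Map;

import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Writable;

public class MapWritableEntrySetExample {
  public static void main(String[] args) {
    // Build a small stripe of (state index -> probability) pairs.
    MapWritable stripe = new MapWritable();
    stripe.put(new IntWritable(0), new DoubleWritable(0.25));
    stripe.put(new IntWritable(1), new DoubleWritable(0.75));

    // entrySet() returns Set<Map.Entry<Writable, Writable>>, so each key and
    // value must be cast back to the concrete type that was put into the map.
    for (Map.Entry<Writable, Writable> entry : stripe.entrySet()) {
      int state = ((IntWritable) entry.getKey()).get();
      double probability = ((DoubleWritable) entry.getValue()).get();
      System.out.println(state + " -> " + probability);
    }
  }
}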
From source file:org.apache.mahout.classifier.sequencelearning.baumwelchmapreduce.BaumWelchReducer.java
License:Apache License
@Override
protected void reduce(Text key, Iterable<MapWritable> stripes, Context context)
    throws IOException, InterruptedException {
  log.info("Entering Reducer. Key = {}", key.toString());
  MapWritable sumOfStripes = new MapWritable();
  boolean isInitial = false;
  boolean isTransit = false;
  boolean isEmit = false;
  int stateID = -1;
  if (key.charAt(0) == 'I') {
    isInitial = true;
  } else if (key.charAt(0) == 'E') {
    isEmit = true;
    stateID = Character.getNumericValue(key.charAt(5));
  } else if (key.charAt(0) == 'T') {
    isTransit = true;
    stateID = Character.getNumericValue(key.charAt(8));
  } else {
    throw new IllegalStateException("Baum Welch Reducer Error Determining the Key Type");
  }
  if (isInitial) {
    Double[] val = new Double[nrOfHiddenStates];
    for (int i = 0; i < nrOfHiddenStates; i++) {
      val[i] = 0.0;
    }
    for (MapWritable stripe : stripes) {
      log.info("Reducer Processing Initial Distribution Stripe.");
      for (MapWritable.Entry<Writable, Writable> stripeEntry : stripe.entrySet()) {
        log.info("Reducer Getting Initial Distribution Stripe Entry. Key = {} Value = {} ",
            Integer.toString(((IntWritable) stripeEntry.getKey()).get()),
            Double.toString(((DoubleWritable) stripeEntry.getValue()).get()));
        val[((IntWritable) stripeEntry.getKey()).get()] += ((DoubleWritable) stripeEntry.getValue()).get();
      }
    }
    for (int i = 0; i < nrOfHiddenStates; i++) {
      log.info("Reducer adding to sumOfStripes for Initial. Key = {} Value ={}",
          Integer.toString(i), Double.toString(val[i]));
      sumOfStripes.put(new IntWritable(i), new DoubleWritable(val[i]));
    }
  } else if (isEmit) {
    Iterator<MapWritable> it = stripes.iterator();
    int seqlength = it.next().size();
    Double[] val = new Double[nrOfEmittedStates];
    for (int i = 0; i < nrOfEmittedStates; i++) {
      val[i] = 0.0;
    }
    for (MapWritable stripe : stripes) {
      log.info("Reducer Processing Emission Distribution Stripe.");
      for (MapWritable.Entry<Writable, Writable> stripeEntry : stripe.entrySet()) {
        log.info("Reducer Getting Emission Distribution Stripe Entry. Key = {} Value = {} ",
            Integer.toString(((IntWritable) stripeEntry.getKey()).get()),
            Double.toString(((DoubleWritable) stripeEntry.getValue()).get()));
        val[((IntWritable) stripeEntry.getKey()).get()] += ((DoubleWritable) stripeEntry.getValue()).get();
      }
    }
    for (int i = 0; i < nrOfEmittedStates; i++) {
      log.info("Reducer adding to sumOfStripes for Emission. Key = {} Value ={}",
          Integer.toString(i), Double.toString(val[i]));
      sumOfStripes.put(new IntWritable(i), new DoubleWritable(val[i]));
    }
  } else if (isTransit) {
    Double[] val = new Double[nrOfHiddenStates];
    for (int i = 0; i < nrOfHiddenStates; i++) {
      val[i] = 0.0;
    }
    for (MapWritable stripe : stripes) {
      log.info("Reducer Processing Transition Distribution Stripe.");
      for (MapWritable.Entry<Writable, Writable> stripeEntry : stripe.entrySet()) {
        log.info("Reducer Getting Transition Distribution Stripe Entry. Key = {} Value = {} ",
            Integer.toString(((IntWritable) stripeEntry.getKey()).get()),
            Double.toString(((DoubleWritable) stripeEntry.getValue()).get()));
        val[((IntWritable) stripeEntry.getKey()).get()] += ((DoubleWritable) stripeEntry.getValue()).get();
      }
    }
    for (int i = 0; i < nrOfHiddenStates; i++) {
      log.info("Reducer adding to sumOfStripes for Transition. Key = {} Value ={}",
          Integer.toString(i), Double.toString(val[i]));
      sumOfStripes.put(new IntWritable(i), new DoubleWritable(val[i]));
    }
  } else {
    throw new IllegalStateException("Baum Welch Reducer Error: Unable to aggregate distribution stripes.");
  }
  double sum = 0.0;
  for (MapWritable.Entry<Writable, Writable> sumEntry : sumOfStripes.entrySet()) {
    sum += ((DoubleWritable) sumEntry.getValue()).get();
  }
  MapWritable distributionStripe = new MapWritable();
  for (MapWritable.Entry<Writable, Writable> sumEntry : sumOfStripes.entrySet()) {
    IntWritable state = (IntWritable) sumEntry.getKey();
    double innerValue = ((DoubleWritable) sumEntry.getValue()).get();
    double normalizedSum = innerValue / sum;
    distributionStripe.put(state, new DoubleWritable(normalizedSum));
  }
  log.info("Reducer Writing: Key = {} Value (Stripe) Size = {}", key.toString(), distributionStripe.size());
  for (MapWritable.Entry<Writable, Writable> entry : distributionStripe.entrySet()) {
    log.info("Distribution Stripe Detail Key = {}, Value ={}",
        ((IntWritable) entry.getKey()).get(), ((DoubleWritable) entry.getValue()).get());
  }
  context.write(key, distributionStripe);
}
From source file:org.apache.mahout.classifier.sequencelearning.baumwelchmapreduce.BaumWelchUtils.java
License:Apache License
public static HmmModel CreateHmmModel(int nrOfHiddenStates, int nrOfOutputStates, Path modelPath,
    Configuration conf) throws IOException {
  log.info("Entering Create Hmm Model. Model Path = {}", modelPath.toUri());
  Vector initialProbabilities = new DenseVector(nrOfHiddenStates);
  Matrix transitionMatrix = new DenseMatrix(nrOfHiddenStates, nrOfHiddenStates);
  Matrix emissionMatrix = new DenseMatrix(nrOfHiddenStates, nrOfOutputStates);
  // Get the path location where the seq files encoding model are stored
  Path modelFilesPath = new Path(modelPath, "*");
  log.info("Create Hmm Model. ModelFiles Path = {}", modelFilesPath.toUri());
  Collection<Path> result = new ArrayList<Path>();
  // get all filtered file names in result list
  FileSystem fs = modelFilesPath.getFileSystem(conf);
  log.info("Create Hmm Model. File System = {}", fs);
  FileStatus[] matches = fs.listStatus(
      FileUtil.stat2Paths(fs.globStatus(modelFilesPath, PathFilters.partFilter())),
      PathFilters.partFilter());
  for (FileStatus match : matches) {
    log.info("CreateHmmmModel Adding File Match {}", match.getPath().toString());
    result.add(fs.makeQualified(match.getPath()));
  }
  // iterate through the result path list
  for (Path path : result) {
    for (Pair<Writable, MapWritable> pair : new SequenceFileIterable<Writable, MapWritable>(path, true, conf)) {
      Text key = (Text) pair.getFirst();
      log.info("CreateHmmModel Matching Seq File Key = {}", key);
      MapWritable valueMap = pair.getSecond();
      if (key.charAt(0) == 'I') {
        // initial distribution stripe
        for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
          log.info("CreateHmmModel Initial Prob Adding Key, Value = ({} {})",
              ((IntWritable) entry.getKey()).get(), ((DoubleWritable) entry.getValue()).get());
          initialProbabilities.set(((IntWritable) entry.getKey()).get(),
              ((DoubleWritable) entry.getValue()).get());
        }
      } else if (key.charAt(0) == 'T') {
        // transition distribution stripe
        // key is of the form TRANSIT_0, TRANSIT_1 etc; the state ID is the digit at char 8
        int stateID = Character.getNumericValue(key.charAt(8));
        log.info("CreateHmmModel stateID = key.charAt(8) = {}", stateID);
        for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
          log.info("CreateHmmModel Transition Matrix ({}, {}) = {}",
              new Object[] { stateID, ((IntWritable) entry.getKey()).get(),
                  ((DoubleWritable) entry.getValue()).get() });
          transitionMatrix.set(stateID, ((IntWritable) entry.getKey()).get(),
              ((DoubleWritable) entry.getValue()).get());
        }
      } else if (key.charAt(0) == 'E') {
        // emission distribution stripe
        // key is of the form EMIT_0, EMIT_1 etc; the state ID is the digit at char 5
        int stateID = Character.getNumericValue(key.charAt(5));
        for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
          log.info("CreateHmmModel Emission Matrix ({}, {}) = {}",
              new Object[] { stateID, ((IntWritable) entry.getKey()).get(),
                  ((DoubleWritable) entry.getValue()).get() });
          emissionMatrix.set(stateID, ((IntWritable) entry.getKey()).get(),
              ((DoubleWritable) entry.getValue()).get());
        }
      } else {
        throw new IllegalStateException("Error creating HmmModel from Sequence File Path");
      }
    }
  }
  HmmModel model = new HmmModel(transitionMatrix, emissionMatrix, initialProbabilities);
  HmmUtils.validate(model);
  return model;
}
From source file:org.apache.mahout.classifier.sequencelearning.baumwelchmapreduce.BaumWelchUtils.java
License:Apache License
protected static void WriteModelToDirectory(HmmModel model, Path modelPath, Configuration conf)
    throws IOException {
  int numHidden = model.getNrOfHiddenStates();
  int numObserved = model.getNrOfOutputStates();
  Matrix emissionMatrix = model.getEmissionMatrix();
  Matrix transitionMatrix = model.getTransitionMatrix();
  Vector initialProbability = model.getInitialProbabilities();
  MapWritable initialDistributionMap = new MapWritable();
  MapWritable transitionDistributionMap = new MapWritable();
  MapWritable emissionDistributionMap = new MapWritable();
  // delete the output directory
  HadoopUtil.delete(conf, modelPath);
  // create new file to store HMM
  FileSystem fs = FileSystem.get(modelPath.toUri(), conf);
  Path outFile = new Path(modelPath, "part-randomSeed");
  boolean newFile = fs.createNewFile(outFile);
  if (newFile) {
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, outFile, Text.class, MapWritable.class);
    try {
      // construct one MapWritable<IntWritable, DoubleWritable> object
      // and two MapWritable<Text, MapWritable<IntWritable, DoubleWritable>> objects
      for (int i = 0; i < numHidden; i++) {
        IntWritable initialDistributionKey = new IntWritable(i);
        DoubleWritable initialDistributionValue = new DoubleWritable(initialProbability.get(i));
        log.info("BuildRandomModel Initial Distribution Map: State {} = {})",
            initialDistributionKey.get(), initialDistributionValue.get());
        initialDistributionMap.put(initialDistributionKey, initialDistributionValue);
        Text transitionDistributionKey = new Text("TRANSIT_" + Integer.toString(i));
        MapWritable transitionDistributionValue = new MapWritable();
        for (int j = 0; j < numHidden; j++) {
          IntWritable transitionDistributionInnerKey = new IntWritable(j);
          DoubleWritable transitionDistributionInnerValue = new DoubleWritable(transitionMatrix.get(i, j));
          log.info("BuildRandomModel Transition Distribution Map Inner: ({}, {}) = ({}, {})",
              new Object[] { i, j, transitionDistributionInnerKey.get(),
                  transitionDistributionInnerValue.get() });
          transitionDistributionValue.put(transitionDistributionInnerKey, transitionDistributionInnerValue);
        }
        transitionDistributionMap.put(transitionDistributionKey, transitionDistributionValue);
        Text emissionDistributionKey = new Text("EMIT_" + Integer.toString(i));
        MapWritable emissionDistributionValue = new MapWritable();
        for (int j = 0; j < numObserved; j++) {
          IntWritable emissionDistributionInnerKey = new IntWritable(j);
          DoubleWritable emissionDistributionInnerValue = new DoubleWritable(emissionMatrix.get(i, j));
          log.info("BuildRandomModel Emission Distribution Map Inner: ({}, {}) = ({}, {})",
              new Object[] { i, j, emissionDistributionInnerKey.get(),
                  emissionDistributionInnerValue.get() });
          emissionDistributionValue.put(emissionDistributionInnerKey, emissionDistributionInnerValue);
        }
        emissionDistributionMap.put(emissionDistributionKey, emissionDistributionValue);
      }
      writer.append(new Text("INITIAL"), initialDistributionMap);
      log.info("Wrote random Initial Distribution Map to {}", outFile);
      for (MapWritable.Entry<Writable, Writable> transitionEntry : transitionDistributionMap.entrySet()) {
        log.info("Writing Transition Distribution Map Key, Value = ({}, {})",
            transitionEntry.getKey(), transitionEntry.getValue());
        writer.append(transitionEntry.getKey(), transitionEntry.getValue());
      }
      log.info("Wrote random Transition Distribution Map to {}", outFile);
      for (MapWritable.Entry<Writable, Writable> emissionEntry : emissionDistributionMap.entrySet()) {
        log.info("Writing Emission Distribution Map Key, Value = ({}, {})",
            emissionEntry.getKey(), emissionEntry.getValue());
        writer.append(emissionEntry.getKey(), emissionEntry.getValue());
      }
      log.info("Wrote random Emission Distribution Map to {}", outFile);
    } finally {
      Closeables.closeQuietly(writer);
    }
  }
}
From source file:org.apache.mahout.classifier.sequencelearning.hmm.hadoop.BaumWelchCombiner.java
License:Apache License
@Override
protected void reduce(Text key, Iterable<MapWritable> stripes, Context context)
    throws IOException, InterruptedException {
  MapWritable sumOfStripes = new MapWritable();
  if (scaling.equals("logscaling")) {
    for (MapWritable stripe : stripes) {
      for (Map.Entry e : stripe.entrySet()) {
        double val = ((DoubleWritable) e.getValue()).get();
        if (!sumOfStripes.containsKey(e.getKey())) {
          sumOfStripes.put((IntWritable) e.getKey(), new DoubleWritable(val));
        } else {
          double sumSripesVal = ((DoubleWritable) sumOfStripes.get(e.getKey())).get();
          if (sumSripesVal > Double.NEGATIVE_INFINITY) {
            val = val + Math.log(1 + Math.exp(sumSripesVal - val));
          }
          sumOfStripes.put((IntWritable) e.getKey(), new DoubleWritable(val));
        }
      }
    }
  } else if (scaling.equals("rescaling")) {
    for (MapWritable stripe : stripes) {
      for (Map.Entry e : stripe.entrySet()) {
        if (key.charAt(0) == (int) 'I') {
          double val = ((DoubleWritable) e.getValue()).get();
          if (!sumOfStripes.containsKey(e.getKey())) {
            sumOfStripes.put((IntWritable) e.getKey(), (DoubleWritable) e.getValue());
          } else {
            val += ((DoubleWritable) sumOfStripes.get(e.getKey())).get();
            sumOfStripes.put((IntWritable) e.getKey(), new DoubleWritable(val));
          }
        } else {
          double[] pr = BaumWelchUtils.toDoublePair(((BytesWritable) e.getValue()).getBytes());
          double num = pr[0];
          double denom = pr[1];
          if (!sumOfStripes.containsKey(e.getKey())) {
            sumOfStripes.put((IntWritable) e.getKey(), (BytesWritable) e.getValue());
          } else {
            double[] pr1 = BaumWelchUtils.toDoublePair(((BytesWritable) sumOfStripes.get(e.getKey())).getBytes());
            num += pr1[0];
            denom += pr1[1];
            byte[] doublePair1 = BaumWelchUtils.doublePairToByteArray(num, denom);
            sumOfStripes.put((IntWritable) e.getKey(), new BytesWritable(doublePair1));
          }
        }
      }
    }
  } else {
    for (MapWritable stripe : stripes) {
      for (Map.Entry e : stripe.entrySet()) {
        double val = ((DoubleWritable) e.getValue()).get();
        if (!sumOfStripes.containsKey(e.getKey())) {
          sumOfStripes.put((IntWritable) e.getKey(), (DoubleWritable) e.getValue());
        } else {
          val += ((DoubleWritable) sumOfStripes.get(e.getKey())).get();
          sumOfStripes.put((IntWritable) e.getKey(), new DoubleWritable(val));
        }
      }
    }
  }
  context.write(key, sumOfStripes);
}
From source file:org.apache.mahout.classifier.sequencelearning.hmm.hadoop.BaumWelchMapper.java
License:Apache License
@Override
public void setup(Context context) throws IOException, InterruptedException {
  super.setup(context);
  Configuration config = context.getConfiguration();
  String scalingMethod = config.get(BaumWelchConfigKeys.SCALING_OPTION_KEY);
  if (scalingMethod.equals("rescaling")) {
    scaling = HmmAlgorithms.ScalingMethod.RESCALING;
  } else if (scalingMethod.equals("logscaling")) {
    scaling = HmmAlgorithms.ScalingMethod.LOGSCALING;
  }
  nrOfHiddenStates = Integer.parseInt(config.get(BaumWelchConfigKeys.NUMBER_OF_HIDDEN_STATES_KEY));
  nrOfEmittedStates = Integer.parseInt(config.get(BaumWelchConfigKeys.NUMBER_OF_EMITTED_STATES_KEY));
  MapWritable hiddenStatesWritableMap = MapWritableCache.load(config,
      new Path(config.get(BaumWelchConfigKeys.HIDDEN_STATES_MAP_PATH)));
  MapWritable emittedStatesWritableMap = MapWritableCache.load(config,
      new Path(config.get(BaumWelchConfigKeys.EMITTED_STATES_MAP_PATH)));
  String[] hiddenStatesArray = new String[hiddenStatesWritableMap.size()];
  String[] emittedStatesArray = new String[emittedStatesWritableMap.size()];
  int k = 0;
  int l = 0;
  for (MapWritable.Entry<Writable, Writable> entry : hiddenStatesWritableMap.entrySet()) {
    hiddenStatesArray[k++] = entry.getKey().toString();
  }
  for (MapWritable.Entry<Writable, Writable> entry : emittedStatesWritableMap.entrySet()) {
    emittedStatesArray[l++] = entry.getKey().toString();
  }
  modelPath = new Path(config.get(BaumWelchConfigKeys.MODEL_PATH_KEY));
  Model = BaumWelchUtils.createHmmModel(nrOfHiddenStates, nrOfEmittedStates, modelPath, config);
  Model.registerHiddenStateNames(hiddenStatesArray);
  Model.registerOutputStateNames(emittedStatesArray);
  HmmUtils.normalizeModel(Model);
  HmmUtils.validate(Model);
  log.info("Mapper Setup Hmm Model Created. Hidden States = {} Emitted States = {}",
      Model.getNrOfHiddenStates(), Model.getNrOfOutputStates());
}
From source file:org.apache.mahout.classifier.sequencelearning.hmm.hadoop.BaumWelchReducer.java
License:Apache License
@Override
protected void reduce(Text key, Iterable<MapWritable> stripes, Context context)
    throws IOException, InterruptedException {
  MapWritable sumOfStripes = new MapWritable();
  // Finish the Expectation Step by aggregating all posterior probabilities for one key
  if (scaling.equals("logscaling")) {
    double totalValSum = Double.NEGATIVE_INFINITY;
    for (MapWritable stripe : stripes) {
      for (Map.Entry e : stripe.entrySet()) {
        double val = ((DoubleWritable) e.getValue()).get();
        double max = totalValSum > val ? totalValSum : val;
        totalValSum = max + Math.log(Math.exp(totalValSum - max) + Math.exp(val - max));
        if (!sumOfStripes.containsKey(e.getKey())) {
          sumOfStripes.put((IntWritable) e.getKey(), new DoubleWritable(val));
        } else {
          double sumSripesVal = ((DoubleWritable) sumOfStripes.get(e.getKey())).get();
          if (sumSripesVal > Double.NEGATIVE_INFINITY) {
            val = val + Math.log(1 + Math.exp(sumSripesVal - val));
          }
          sumOfStripes.put((IntWritable) e.getKey(), new DoubleWritable(val));
        }
      }
    }
    // normalize the aggregate
    for (Map.Entry e : sumOfStripes.entrySet()) {
      double val = ((DoubleWritable) e.getValue()).get();
      if (totalValSum > Double.NEGATIVE_INFINITY) {
        val = val - totalValSum;
      }
      sumOfStripes.put((IntWritable) e.getKey(), new DoubleWritable(Math.exp(val)));
    }
  } else if (scaling.equals("rescaling")) {
    double totalValSum = 0.0;
    for (MapWritable stripe : stripes) {
      for (Map.Entry e : stripe.entrySet()) {
        if (key.charAt(0) == (int) 'I') {
          double val = ((DoubleWritable) e.getValue()).get();
          totalValSum += val;
          if (!sumOfStripes.containsKey(e.getKey())) {
            sumOfStripes.put((IntWritable) e.getKey(), (DoubleWritable) e.getValue());
          } else {
            val += ((DoubleWritable) sumOfStripes.get(e.getKey())).get();
            sumOfStripes.put((IntWritable) e.getKey(), new DoubleWritable(val));
          }
        } else {
          double[] pr = BaumWelchUtils.toDoublePair(((BytesWritable) e.getValue()).getBytes());
          double num = pr[0];
          double denom = pr[1];
          if (!sumOfStripes.containsKey(e.getKey())) {
            sumOfStripes.put((IntWritable) e.getKey(), (BytesWritable) e.getValue());
          } else {
            double[] pr1 = BaumWelchUtils.toDoublePair(((BytesWritable) sumOfStripes.get(e.getKey())).getBytes());
            num += pr1[0];
            denom += pr1[1];
            byte[] doublePair1 = BaumWelchUtils.doublePairToByteArray(num, denom);
            sumOfStripes.put((IntWritable) e.getKey(), new BytesWritable(doublePair1));
          }
        }
      }
    }
    if (key.charAt(0) == (int) 'I') {
      // normalize the aggregate
      for (Map.Entry e : sumOfStripes.entrySet()) {
        double val = ((DoubleWritable) e.getValue()).get();
        if (totalValSum > 0) {
          val /= totalValSum;
        }
        sumOfStripes.put((IntWritable) e.getKey(), new DoubleWritable(val));
      }
    } else {
      // compute the probabilities
      for (Map.Entry e : sumOfStripes.entrySet()) {
        double[] pr1 = BaumWelchUtils.toDoublePair(((BytesWritable) sumOfStripes.get(e.getKey())).getBytes());
        sumOfStripes.put((IntWritable) e.getKey(), new DoubleWritable(pr1[0] / pr1[1]));
      }
    }
  } else {
    double totalValSum = 0.0;
    for (MapWritable stripe : stripes) {
      for (Map.Entry e : stripe.entrySet()) {
        int state = ((IntWritable) e.getKey()).get();
        double val = ((DoubleWritable) e.getValue()).get();
        totalValSum += val;
        if (!sumOfStripes.containsKey(e.getKey())) {
          sumOfStripes.put((IntWritable) e.getKey(), (DoubleWritable) e.getValue());
        } else {
          val += ((DoubleWritable) sumOfStripes.get(e.getKey())).get();
          sumOfStripes.put((IntWritable) e.getKey(), new DoubleWritable(val));
        }
      }
    }
    // normalize the aggregate
    for (Map.Entry e : sumOfStripes.entrySet()) {
      double val = ((DoubleWritable) e.getValue()).get();
      if (totalValSum > 0) {
        val /= totalValSum;
      }
      sumOfStripes.put((IntWritable) e.getKey(), new DoubleWritable(val));
    }
  }
  // Write the distribution parameter vector to HDFS for the next iteration
  context.write(key, sumOfStripes);
}
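The log-scaling branch above accumulates probabilities in log space to avoid underflow: adding a new log value v to a running log total s uses s' = v + log(1 + exp(s - v)), which equals log(exp(s) + exp(v)). The small standalone sketch below is illustrative only (not part of the Mahout sources) and checks that identity for a pair of log probabilities.

public class LogSpaceSumSketch {
  public static void main(String[] args) {
    // two probabilities represented in log space
    double logP1 = Math.log(1e-3);
    double logP2 = Math.log(4e-3);
    // accumulate them the way the log-scaling reducer/combiner branches do
    double logSum = logP2 + Math.log(1 + Math.exp(logP1 - logP2));
    // direct computation for comparison; both print approximately 0.005
    System.out.println(Math.exp(logSum) + " vs " + (1e-3 + 4e-3));
  }
}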
From source file:org.apache.mahout.classifier.sequencelearning.hmm.hadoop.BaumWelchUtils.java
License:Apache License
/**
 * Converts the sequence files present in a directory to a {@link HmmModel} model.
 *
 * @param nrOfHiddenStates Number of hidden states
 * @param nrOfOutputStates Number of output states
 * @param modelPath        Location of the sequence files containing the model's distributions
 * @param conf             Configuration object
 * @return HmmModel the encoded model
 * @throws IOException
 */
public static HmmModel createHmmModel(int nrOfHiddenStates, int nrOfOutputStates, Path modelPath,
    Configuration conf) throws IOException {
  log.info("Entering Create Hmm Model. Model Path = {}", modelPath.toUri());
  Vector initialProbabilities = new DenseVector(nrOfHiddenStates);
  Matrix transitionMatrix = new DenseMatrix(nrOfHiddenStates, nrOfHiddenStates);
  Matrix emissionMatrix = new DenseMatrix(nrOfHiddenStates, nrOfOutputStates);
  // Get the path location where the seq files encoding model are stored
  Path modelFilesPath = new Path(modelPath, "*");
  Collection<Path> result = new ArrayList<Path>();
  // get all filtered file names in result list
  FileSystem fs = modelFilesPath.getFileSystem(conf);
  FileStatus[] matches = fs.listStatus(
      FileUtil.stat2Paths(fs.globStatus(modelFilesPath, PathFilters.partFilter())),
      PathFilters.partFilter());
  for (FileStatus match : matches) {
    result.add(fs.makeQualified(match.getPath()));
  }
  // iterate through the result path list
  for (Path path : result) {
    for (Pair<Writable, MapWritable> pair : new SequenceFileIterable<Writable, MapWritable>(path, true, conf)) {
      Text key = (Text) pair.getFirst();
      MapWritable valueMap = pair.getSecond();
      if (key.charAt(0) == (int) 'I') {
        // initial distribution stripe
        for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
          initialProbabilities.set(((IntWritable) entry.getKey()).get(),
              ((DoubleWritable) entry.getValue()).get());
        }
      } else if (key.charAt(0) == (int) 'T') {
        // transition distribution stripe; key is of the form TRANSIT_0, TRANSIT_1 etc
        int stateID = Integer.parseInt(key.toString().split("_")[1]);
        for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
          transitionMatrix.set(stateID, ((IntWritable) entry.getKey()).get(),
              ((DoubleWritable) entry.getValue()).get());
        }
      } else if (key.charAt(0) == (int) 'E') {
        // emission distribution stripe; key is of the form EMIT_0, EMIT_1 etc
        int stateID = Integer.parseInt(key.toString().split("_")[1]);
        for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
          emissionMatrix.set(stateID, ((IntWritable) entry.getKey()).get(),
              ((DoubleWritable) entry.getValue()).get());
        }
      } else {
        throw new IllegalStateException("Error creating HmmModel from Sequence File Path");
      }
    }
  }
  HmmModel model = new HmmModel(transitionMatrix, emissionMatrix, initialProbabilities);
  if (model != null) {
    return model;
  } else {
    throw new IOException("Error building model from output location");
  }
}
From source file:org.apache.mahout.classifier.sequencelearning.hmm.hadoop.BaumWelchUtils.java
License:Apache License
/**
 * Encodes a particular HmmModel as a Sequence File and writes it to the specified location.
 *
 * @param model     HmmModel to be encoded
 * @param modelPath Location to store the encoded model
 * @param conf      Configuration object
 * @throws IOException
 */
protected static void writeModelToDirectory(HmmModel model, Path modelPath, Configuration conf)
    throws IOException {
  int numHidden = model.getNrOfHiddenStates();
  int numObserved = model.getNrOfOutputStates();
  Matrix emissionMatrix = model.getEmissionMatrix();
  Matrix transitionMatrix = model.getTransitionMatrix();
  Vector initialProbability = model.getInitialProbabilities();
  MapWritable initialDistributionMap = new MapWritable();
  MapWritable transitionDistributionMap = new MapWritable();
  MapWritable emissionDistributionMap = new MapWritable();
  // delete the output directory
  HadoopUtil.delete(conf, modelPath);
  // create new file to store HMM
  FileSystem fs = FileSystem.get(modelPath.toUri(), conf);
  Path outFile = new Path(modelPath, "part-randomSeed");
  boolean newFile = fs.createNewFile(outFile);
  if (newFile) {
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, outFile, Text.class, MapWritable.class);
    try {
      for (int i = 0; i < numHidden; i++) {
        IntWritable initialDistributionKey = new IntWritable(i);
        DoubleWritable initialDistributionValue = new DoubleWritable(initialProbability.get(i));
        initialDistributionMap.put(initialDistributionKey, initialDistributionValue);
        Text transitionDistributionKey = new Text("TRANSIT_" + Integer.toString(i));
        MapWritable transitionDistributionValue = new MapWritable();
        for (int j = 0; j < numHidden; j++) {
          IntWritable transitionDistributionInnerKey = new IntWritable(j);
          DoubleWritable transitionDistributionInnerValue = new DoubleWritable(transitionMatrix.get(i, j));
          transitionDistributionValue.put(transitionDistributionInnerKey, transitionDistributionInnerValue);
        }
        transitionDistributionMap.put(transitionDistributionKey, transitionDistributionValue);
        Text emissionDistributionKey = new Text("EMIT_" + Integer.toString(i));
        MapWritable emissionDistributionValue = new MapWritable();
        for (int j = 0; j < numObserved; j++) {
          IntWritable emissionDistributionInnerKey = new IntWritable(j);
          DoubleWritable emissionDistributionInnerValue = new DoubleWritable(emissionMatrix.get(i, j));
          emissionDistributionValue.put(emissionDistributionInnerKey, emissionDistributionInnerValue);
        }
        emissionDistributionMap.put(emissionDistributionKey, emissionDistributionValue);
      }
      writer.append(new Text("INITIAL"), initialDistributionMap);
      log.info("Wrote random Initial Distribution Map to {}", outFile);
      for (MapWritable.Entry<Writable, Writable> transitionEntry : transitionDistributionMap.entrySet()) {
        writer.append(transitionEntry.getKey(), transitionEntry.getValue());
      }
      log.info("Wrote random Transition Distribution Map to {}", outFile);
      for (MapWritable.Entry<Writable, Writable> emissionEntry : emissionDistributionMap.entrySet()) {
        writer.append(emissionEntry.getKey(), emissionEntry.getValue());
      }
      log.info("Wrote random Emission Distribution Map to {}", outFile);
    } finally {
      Closeables.closeQuietly(writer);
    }
  }
}
From source file:org.apache.nutch.crawl.CrawlDatum.java
License:Apache License
private boolean metadataEquals(org.apache.hadoop.io.MapWritable otherMetaData) {
  if (metaData == null || metaData.size() == 0) {
    return otherMetaData == null || otherMetaData.size() == 0;
  }
  if (otherMetaData == null) {
    // we already know that the current object is not null or empty
    return false;
  }
  HashSet<Entry<Writable, Writable>> set1 = new HashSet<Entry<Writable, Writable>>(metaData.entrySet());
  HashSet<Entry<Writable, Writable>> set2 = new HashSet<Entry<Writable, Writable>>(otherMetaData.entrySet());
  return set1.equals(set2);
}
From source file:org.apache.nutch.crawl.CrawlDbReducer.java
License:Apache License
public void reduce(Text key, Iterator<CrawlDatum> values, OutputCollector<Text, CrawlDatum> output,
    Reporter reporter) throws IOException {
  CrawlDatum fetch = new CrawlDatum();
  CrawlDatum old = new CrawlDatum();
  boolean fetchSet = false;
  boolean oldSet = false;
  byte[] signature = null;
  boolean multiple = false; // avoid deep copy when only single value exists
  linked.clear();
  org.apache.hadoop.io.MapWritable metaFromParse = null;

  while (values.hasNext()) {
    CrawlDatum datum = (CrawlDatum) values.next();
    if (!multiple && values.hasNext())
      multiple = true;
    if (CrawlDatum.hasDbStatus(datum)) {
      if (!oldSet) {
        if (multiple) {
          old.set(datum);
        } else {
          // no need for a deep copy - this is the only value
          old = datum;
        }
        oldSet = true;
      } else {
        // always take the latest version
        if (old.getFetchTime() < datum.getFetchTime())
          old.set(datum);
      }
      continue;
    }
    if (CrawlDatum.hasFetchStatus(datum)) {
      if (!fetchSet) {
        if (multiple) {
          fetch.set(datum);
        } else {
          fetch = datum;
        }
        fetchSet = true;
      } else {
        // always take the latest version
        if (fetch.getFetchTime() < datum.getFetchTime())
          fetch.set(datum);
      }
      continue;
    }
    switch (datum.getStatus()) { // collect other info
    case CrawlDatum.STATUS_LINKED:
      CrawlDatum link;
      if (multiple) {
        link = new CrawlDatum();
        link.set(datum);
      } else {
        link = datum;
      }
      linked.insert(link);
      break;
    case CrawlDatum.STATUS_SIGNATURE:
      signature = datum.getSignature();
      break;
    case CrawlDatum.STATUS_PARSE_META:
      metaFromParse = datum.getMetaData();
      break;
    default:
      LOG.warn("Unknown status, key: " + key + ", datum: " + datum);
    }
  }

  // copy the content of the queue into a List in reversed order
  int numLinks = linked.size();
  List<CrawlDatum> linkList = new ArrayList<CrawlDatum>(numLinks);
  for (int i = numLinks - 1; i >= 0; i--) {
    linkList.add(linked.pop());
  }

  // if it doesn't already exist, skip it
  if (!oldSet && !additionsAllowed)
    return;

  // if there is no fetched datum, perhaps there is a link
  if (!fetchSet && linkList.size() > 0) {
    fetch = linkList.get(0);
    fetchSet = true;
  }

  // still no new data - record only unchanged old data, if exists, and return
  if (!fetchSet) {
    if (oldSet) { // at this point at least "old" should be present
      output.collect(key, old);
      reporter.getCounter("CrawlDB status", CrawlDatum.getStatusName(old.getStatus())).increment(1);
    } else {
      LOG.warn("Missing fetch and old value, signature=" + signature);
    }
    return;
  }

  if (signature == null)
    signature = fetch.getSignature();
  long prevModifiedTime = oldSet ? old.getModifiedTime() : 0L;
  long prevFetchTime = oldSet ? old.getFetchTime() : 0L;

  // initialize with the latest version, be it fetch or link
  result.set(fetch);
  if (oldSet) {
    // copy metadata from old, if exists
    if (old.getMetaData().size() > 0) {
      result.putAllMetaData(old);
      // overlay with new, if any
      if (fetch.getMetaData().size() > 0)
        result.putAllMetaData(fetch);
    }
    // set the most recent valid value of modifiedTime
    if (old.getModifiedTime() > 0 && fetch.getModifiedTime() == 0) {
      result.setModifiedTime(old.getModifiedTime());
    }
  }

  switch (fetch.getStatus()) { // determine new status
  case CrawlDatum.STATUS_LINKED: // it was link
    if (oldSet) { // if old exists
      result.set(old); // use it
    } else {
      result = schedule.initializeSchedule((Text) key, result);
      result.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
      try {
        scfilters.initialScore((Text) key, result);
      } catch (ScoringFilterException e) {
        if (LOG.isWarnEnabled()) {
          LOG.warn("Cannot filter init score for url " + key + ", using default: " + e.getMessage());
        }
        result.setScore(0.0f);
      }
    }
    break;

  case CrawlDatum.STATUS_FETCH_SUCCESS: // succesful fetch
  case CrawlDatum.STATUS_FETCH_REDIR_TEMP: // successful fetch, redirected
  case CrawlDatum.STATUS_FETCH_REDIR_PERM:
  case CrawlDatum.STATUS_FETCH_NOTMODIFIED: // successful fetch, notmodified
    // determine the modification status
    int modified = FetchSchedule.STATUS_UNKNOWN;
    if (fetch.getStatus() == CrawlDatum.STATUS_FETCH_NOTMODIFIED) {
      modified = FetchSchedule.STATUS_NOTMODIFIED;
    } else {
      if (oldSet && old.getSignature() != null && signature != null) {
        if (SignatureComparator._compare(old.getSignature(), signature) != 0) {
          modified = FetchSchedule.STATUS_MODIFIED;
        } else {
          modified = FetchSchedule.STATUS_NOTMODIFIED;
        }
      }
    }
    // set the schedule
    result = schedule.setFetchSchedule((Text) key, result, prevFetchTime, prevModifiedTime,
        fetch.getFetchTime(), fetch.getModifiedTime(), modified);
    // set the result status and signature
    if (modified == FetchSchedule.STATUS_NOTMODIFIED) {
      result.setStatus(CrawlDatum.STATUS_DB_NOTMODIFIED);
      if (oldSet)
        result.setSignature(old.getSignature());
    } else {
      switch (fetch.getStatus()) {
      case CrawlDatum.STATUS_FETCH_SUCCESS:
        result.setStatus(CrawlDatum.STATUS_DB_FETCHED);
        break;
      case CrawlDatum.STATUS_FETCH_REDIR_PERM:
        result.setStatus(CrawlDatum.STATUS_DB_REDIR_PERM);
        break;
      case CrawlDatum.STATUS_FETCH_REDIR_TEMP:
        result.setStatus(CrawlDatum.STATUS_DB_REDIR_TEMP);
        break;
      default:
        LOG.warn("Unexpected status: " + fetch.getStatus() + " resetting to old status.");
        if (oldSet)
          result.setStatus(old.getStatus());
        else
          result.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
      }
      result.setSignature(signature);
      if (metaFromParse != null) {
        for (Entry<Writable, Writable> e : metaFromParse.entrySet()) {
          result.getMetaData().put(e.getKey(), e.getValue());
        }
      }
    }
    // if fetchInterval is larger than the system-wide maximum, trigger
    // an unconditional recrawl. This prevents the page to be stuck at
    // NOTMODIFIED state, when the old fetched copy was already removed with
    // old segments.
    if (maxInterval < result.getFetchInterval())
      result = schedule.forceRefetch((Text) key, result, false);
    break;

  case CrawlDatum.STATUS_SIGNATURE:
    if (LOG.isWarnEnabled()) {
      LOG.warn("Lone CrawlDatum.STATUS_SIGNATURE: " + key);
    }
    return;

  case CrawlDatum.STATUS_FETCH_RETRY: // temporary failure
    if (oldSet) {
      result.setSignature(old.getSignature()); // use old signature
    }
    result = schedule.setPageRetrySchedule((Text) key, result, prevFetchTime, prevModifiedTime,
        fetch.getFetchTime());
    if (result.getRetriesSinceFetch() < retryMax) {
      result.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
    } else {
      result.setStatus(CrawlDatum.STATUS_DB_GONE);
    }
    break;

  case CrawlDatum.STATUS_FETCH_GONE: // permanent failure
    if (oldSet)
      result.setSignature(old.getSignature()); // use old signature
    result.setStatus(CrawlDatum.STATUS_DB_GONE);
    result = schedule.setPageGoneSchedule((Text) key, result, prevFetchTime, prevModifiedTime,
        fetch.getFetchTime());
    break;

  default:
    throw new RuntimeException("Unknown status: " + fetch.getStatus() + " " + key);
  }

  try {
    scfilters.updateDbScore((Text) key, oldSet ? old : null, result, linkList);
  } catch (Exception e) {
    if (LOG.isWarnEnabled()) {
      LOG.warn("Couldn't update score, key=" + key + ": " + e);
    }
  }
  // remove generation time, if any
  result.getMetaData().remove(Nutch.WRITABLE_GENERATE_TIME_KEY);
  output.collect(key, result);
  reporter.getCounter("CrawlDB status", CrawlDatum.getStatusName(result.getStatus())).increment(1);
}