List of usage examples for org.apache.hadoop.io MapWritable entrySet
@Override
public Set<Map.Entry<Writable, Writable>> entrySet()
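Before the full examples, here is a minimal, self-contained sketch of the pattern they all share: entrySet() on a MapWritable yields Map.Entry<Writable, Writable> pairs, so callers cast each key and value back to the concrete Writable types they stored. The class name and the sample keys/values below are illustrative only, not taken from the sources listed afterwards.

import java.util.Map;

import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Writable;

public class MapWritableEntrySetExample {
  public static void main(String[] args) {
    // Build a small stripe of (state index -> probability) pairs.
    MapWritable stripe = new MapWritable();
    stripe.put(new IntWritable(0), new DoubleWritable(0.25));
    stripe.put(new IntWritable(1), new DoubleWritable(0.75));

    // entrySet() returns Set<Map.Entry<Writable, Writable>>, so each key and
    // value must be cast back to the concrete type that was put into the map.
    for (Map.Entry<Writable, Writable> entry : stripe.entrySet()) {
      int state = ((IntWritable) entry.getKey()).get();
      double probability = ((DoubleWritable) entry.getValue()).get();
      System.out.println(state + " -> " + probability);
    }
  }
}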
From source file:org.apache.mahout.classifier.sequencelearning.baumwelchmapreduce.BaumWelchReducer.java
License:Apache License
@Override
protected void reduce(Text key, Iterable<MapWritable> stripes, Context context)
    throws IOException, InterruptedException {
  log.info("Entering Reducer. Key = {}", key.toString());
  MapWritable sumOfStripes = new MapWritable();
  boolean isInitial = false;
  boolean isTransit = false;
  boolean isEmit = false;
  int stateID = -1;
  if (key.charAt(0) == 'I') {
    isInitial = true;
  } else if (key.charAt(0) == 'E') {
    isEmit = true;
    stateID = Character.getNumericValue(key.charAt(5));
  } else if (key.charAt(0) == 'T') {
    isTransit = true;
    stateID = Character.getNumericValue(key.charAt(8));
  } else {
    throw new IllegalStateException("Baum Welch Reducer Error Determining the Key Type");
  }
  if (isInitial) {
    Double[] val = new Double[nrOfHiddenStates];
    for (int i = 0; i < nrOfHiddenStates; i++) {
      val[i] = 0.0;
    }
    for (MapWritable stripe : stripes) {
      log.info("Reducer Processing Initial Distribution Stripe.");
      for (MapWritable.Entry<Writable, Writable> stripeEntry : stripe.entrySet()) {
        log.info("Reducer Getting Initial Distribution Stripe Entry. Key = {} Value = {} ",
            Integer.toString(((IntWritable) stripeEntry.getKey()).get()),
            Double.toString(((DoubleWritable) stripeEntry.getValue()).get()));
        val[((IntWritable) stripeEntry.getKey()).get()] += ((DoubleWritable) stripeEntry.getValue()).get();
      }
    }
    for (int i = 0; i < nrOfHiddenStates; i++) {
      log.info("Reducer adding to sumOfStripes for Initial. Key = {} Value ={}",
          Integer.toString(i), Double.toString(val[i]));
      sumOfStripes.put(new IntWritable(i), new DoubleWritable(val[i]));
    }
  } else if (isEmit) {
    Iterator<MapWritable> it = stripes.iterator();
    int seqlength = it.next().size();
    Double[] val = new Double[nrOfEmittedStates];
    for (int i = 0; i < nrOfEmittedStates; i++) {
      val[i] = 0.0;
    }
    for (MapWritable stripe : stripes) {
      log.info("Reducer Processing Emission Distribution Stripe.");
      for (MapWritable.Entry<Writable, Writable> stripeEntry : stripe.entrySet()) {
        log.info("Reducer Getting Emission Distribution Stripe Entry. Key = {} Value = {} ",
            Integer.toString(((IntWritable) stripeEntry.getKey()).get()),
            Double.toString(((DoubleWritable) stripeEntry.getValue()).get()));
        val[((IntWritable) stripeEntry.getKey()).get()] += ((DoubleWritable) stripeEntry.getValue()).get();
      }
    }
    for (int i = 0; i < nrOfEmittedStates; i++) {
      log.info("Reducer adding to sumOfStripes for Emission. Key = {} Value ={}",
          Integer.toString(i), Double.toString(val[i]));
      sumOfStripes.put(new IntWritable(i), new DoubleWritable(val[i]));
    }
  } else if (isTransit) {
    Double[] val = new Double[nrOfHiddenStates];
    for (int i = 0; i < nrOfHiddenStates; i++) {
      val[i] = 0.0;
    }
    for (MapWritable stripe : stripes) {
      log.info("Reducer Processing Transition Distribution Stripe.");
      for (MapWritable.Entry<Writable, Writable> stripeEntry : stripe.entrySet()) {
        log.info("Reducer Getting Transition Distribution Stripe Entry. Key = {} Value = {} ",
            Integer.toString(((IntWritable) stripeEntry.getKey()).get()),
            Double.toString(((DoubleWritable) stripeEntry.getValue()).get()));
        val[((IntWritable) stripeEntry.getKey()).get()] += ((DoubleWritable) stripeEntry.getValue()).get();
      }
    }
    for (int i = 0; i < nrOfHiddenStates; i++) {
      log.info("Reducer adding to sumOfStripes for Transition. Key = {} Value ={}",
          Integer.toString(i), Double.toString(val[i]));
      sumOfStripes.put(new IntWritable(i), new DoubleWritable(val[i]));
    }
  } else {
    throw new IllegalStateException("Baum Welch Reducer Error: Unable to aggregate distribution stripes.");
  }
  double sum = 0.0;
  for (MapWritable.Entry<Writable, Writable> sumEntry : sumOfStripes.entrySet()) {
    sum += ((DoubleWritable) sumEntry.getValue()).get();
  }
  MapWritable distributionStripe = new MapWritable();
  for (MapWritable.Entry<Writable, Writable> sumEntry : sumOfStripes.entrySet()) {
    IntWritable state = (IntWritable) sumEntry.getKey();
    double innerValue = ((DoubleWritable) sumEntry.getValue()).get();
    double normalizedSum = innerValue / sum;
    distributionStripe.put(state, new DoubleWritable(normalizedSum));
  }
  log.info("Reducer Writing: Key = {} Value (Stripe) Size = {}", key.toString(), distributionStripe.size());
  for (MapWritable.Entry<Writable, Writable> entry : distributionStripe.entrySet()) {
    log.info("Distribution Stripe Detail Key = {}, Value ={}",
        ((IntWritable) entry.getKey()).get(), ((DoubleWritable) entry.getValue()).get());
  }
  context.write(key, distributionStripe);
}
From source file:org.apache.mahout.classifier.sequencelearning.baumwelchmapreduce.BaumWelchUtils.java
License:Apache License
public static HmmModel CreateHmmModel(int nrOfHiddenStates, int nrOfOutputStates, Path modelPath,
    Configuration conf) throws IOException {
  log.info("Entering Create Hmm Model. Model Path = {}", modelPath.toUri());
  Vector initialProbabilities = new DenseVector(nrOfHiddenStates);
  Matrix transitionMatrix = new DenseMatrix(nrOfHiddenStates, nrOfHiddenStates);
  Matrix emissionMatrix = new DenseMatrix(nrOfHiddenStates, nrOfOutputStates);
  // Get the path location where the seq files encoding model are stored
  Path modelFilesPath = new Path(modelPath, "*");
  log.info("Create Hmm Model. ModelFiles Path = {}", modelFilesPath.toUri());
  Collection<Path> result = new ArrayList<Path>();
  // get all filtered file names in result list
  FileSystem fs = modelFilesPath.getFileSystem(conf);
  log.info("Create Hmm Model. File System = {}", fs);
  FileStatus[] matches = fs.listStatus(
      FileUtil.stat2Paths(fs.globStatus(modelFilesPath, PathFilters.partFilter())),
      PathFilters.partFilter());
  for (FileStatus match : matches) {
    log.info("CreateHmmmModel Adding File Match {}", match.getPath().toString());
    result.add(fs.makeQualified(match.getPath()));
  }
  // iterate through the result path list
  for (Path path : result) {
    for (Pair<Writable, MapWritable> pair : new SequenceFileIterable<Writable, MapWritable>(path, true, conf)) {
      Text key = (Text) pair.getFirst();
      log.info("CreateHmmModel Matching Seq File Key = {}", key);
      MapWritable valueMap = pair.getSecond();
      if (key.charAt(0) == 'I') {
        // initial distribution stripe
        for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
          log.info("CreateHmmModel Initial Prob Adding Key, Value = ({} {})",
              ((IntWritable) entry.getKey()).get(), ((DoubleWritable) entry.getValue()).get());
          initialProbabilities.set(((IntWritable) entry.getKey()).get(),
              ((DoubleWritable) entry.getValue()).get());
        }
      } else if (key.charAt(0) == 'T') {
        // transition distribution stripe
        // key is of the form TRANSIT_0, TRANSIT_1 etc; the state ID is the digit at char 8
        int stateID = Character.getNumericValue(key.charAt(8));
        log.info("CreateHmmModel stateID = key.charAt(8) = {}", stateID);
        for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
          log.info("CreateHmmModel Transition Matrix ({}, {}) = {}",
              new Object[] { stateID, ((IntWritable) entry.getKey()).get(),
                  ((DoubleWritable) entry.getValue()).get() });
          transitionMatrix.set(stateID, ((IntWritable) entry.getKey()).get(),
              ((DoubleWritable) entry.getValue()).get());
        }
      } else if (key.charAt(0) == 'E') {
        // emission distribution stripe
        // key is of the form EMIT_0, EMIT_1 etc; the state ID is the digit at char 5
        int stateID = Character.getNumericValue(key.charAt(5));
        for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
          log.info("CreateHmmModel Emission Matrix ({}, {}) = {}",
              new Object[] { stateID, ((IntWritable) entry.getKey()).get(),
                  ((DoubleWritable) entry.getValue()).get() });
          emissionMatrix.set(stateID, ((IntWritable) entry.getKey()).get(),
              ((DoubleWritable) entry.getValue()).get());
        }
      } else {
        throw new IllegalStateException("Error creating HmmModel from Sequence File Path");
      }
    }
  }
  HmmModel model = new HmmModel(transitionMatrix, emissionMatrix, initialProbabilities);
  HmmUtils.validate(model);
  return model;
}
From source file:org.apache.mahout.classifier.sequencelearning.baumwelchmapreduce.BaumWelchUtils.java
License:Apache License
protected static void WriteModelToDirectory(HmmModel model, Path modelPath, Configuration conf)
    throws IOException {
  int numHidden = model.getNrOfHiddenStates();
  int numObserved = model.getNrOfOutputStates();
  Matrix emissionMatrix = model.getEmissionMatrix();
  Matrix transitionMatrix = model.getTransitionMatrix();
  Vector initialProbability = model.getInitialProbabilities();
  MapWritable initialDistributionMap = new MapWritable();
  MapWritable transitionDistributionMap = new MapWritable();
  MapWritable emissionDistributionMap = new MapWritable();
  // delete the output directory
  HadoopUtil.delete(conf, modelPath);
  // create new file to store HMM
  FileSystem fs = FileSystem.get(modelPath.toUri(), conf);
  Path outFile = new Path(modelPath, "part-randomSeed");
  boolean newFile = fs.createNewFile(outFile);
  if (newFile) {
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, outFile, Text.class, MapWritable.class);
    try {
      // construct one MapWritable<IntWritable, DoubleWritable> object
      // and two MapWritable<Text, MapWritable<IntWritable, DoubleWritable>> objects
      for (int i = 0; i < numHidden; i++) {
        IntWritable initialDistributionKey = new IntWritable(i);
        DoubleWritable initialDistributionValue = new DoubleWritable(initialProbability.get(i));
        log.info("BuildRandomModel Initial Distribution Map: State {} = {})",
            initialDistributionKey.get(), initialDistributionValue.get());
        initialDistributionMap.put(initialDistributionKey, initialDistributionValue);
        Text transitionDistributionKey = new Text("TRANSIT_" + Integer.toString(i));
        MapWritable transitionDistributionValue = new MapWritable();
        for (int j = 0; j < numHidden; j++) {
          IntWritable transitionDistributionInnerKey = new IntWritable(j);
          DoubleWritable transitionDistributionInnerValue = new DoubleWritable(transitionMatrix.get(i, j));
          log.info("BuildRandomModel Transition Distribution Map Inner: ({}, {}) = ({}, {})",
              new Object[] { i, j, transitionDistributionInnerKey.get(),
                  transitionDistributionInnerValue.get() });
          transitionDistributionValue.put(transitionDistributionInnerKey, transitionDistributionInnerValue);
        }
        transitionDistributionMap.put(transitionDistributionKey, transitionDistributionValue);
        Text emissionDistributionKey = new Text("EMIT_" + Integer.toString(i));
        MapWritable emissionDistributionValue = new MapWritable();
        for (int j = 0; j < numObserved; j++) {
          IntWritable emissionDistributionInnerKey = new IntWritable(j);
          DoubleWritable emissionDistributionInnerValue = new DoubleWritable(emissionMatrix.get(i, j));
          log.info("BuildRandomModel Emission Distribution Map Inner: ({}, {}) = ({}, {})",
              new Object[] { i, j, emissionDistributionInnerKey.get(),
                  emissionDistributionInnerValue.get() });
          emissionDistributionValue.put(emissionDistributionInnerKey, emissionDistributionInnerValue);
        }
        emissionDistributionMap.put(emissionDistributionKey, emissionDistributionValue);
      }
      writer.append(new Text("INITIAL"), initialDistributionMap);
      log.info("Wrote random Initial Distribution Map to {}", outFile);
      for (MapWritable.Entry<Writable, Writable> transitionEntry : transitionDistributionMap.entrySet()) {
        log.info("Writing Transition Distribution Map Key, Value = ({}, {})",
            transitionEntry.getKey(), transitionEntry.getValue());
        writer.append(transitionEntry.getKey(), transitionEntry.getValue());
      }
      log.info("Wrote random Transition Distribution Map to {}", outFile);
      for (MapWritable.Entry<Writable, Writable> emissionEntry : emissionDistributionMap.entrySet()) {
        log.info("Writing Emission Distribution Map Key, Value = ({}, {})",
            emissionEntry.getKey(), emissionEntry.getValue());
        writer.append(emissionEntry.getKey(), emissionEntry.getValue());
      }
      log.info("Wrote random Emission Distribution Map to {}", outFile);
    } finally {
      Closeables.closeQuietly(writer);
    }
  }
}
From source file:org.apache.mahout.classifier.sequencelearning.hmm.hadoop.BaumWelchCombiner.java
License:Apache License
@Override
protected void reduce(Text key, Iterable<MapWritable> stripes, Context context)
    throws IOException, InterruptedException {
  MapWritable sumOfStripes = new MapWritable();
  if (scaling.equals("logscaling")) {
    for (MapWritable stripe : stripes) {
      for (Map.Entry e : stripe.entrySet()) {
        double val = ((DoubleWritable) e.getValue()).get();
        if (!sumOfStripes.containsKey(e.getKey())) {
          sumOfStripes.put((IntWritable) e.getKey(), new DoubleWritable(val));
        } else {
          double sumSripesVal = ((DoubleWritable) sumOfStripes.get(e.getKey())).get();
          if (sumSripesVal > Double.NEGATIVE_INFINITY) {
            val = val + Math.log(1 + Math.exp(sumSripesVal - val));
          }
          sumOfStripes.put((IntWritable) e.getKey(), new DoubleWritable(val));
        }
      }
    }
  } else if (scaling.equals("rescaling")) {
    for (MapWritable stripe : stripes) {
      for (Map.Entry e : stripe.entrySet()) {
        if (key.charAt(0) == (int) 'I') {
          double val = ((DoubleWritable) e.getValue()).get();
          if (!sumOfStripes.containsKey(e.getKey())) {
            sumOfStripes.put((IntWritable) e.getKey(), (DoubleWritable) e.getValue());
          } else {
            val += ((DoubleWritable) sumOfStripes.get(e.getKey())).get();
            sumOfStripes.put((IntWritable) e.getKey(), new DoubleWritable(val));
          }
        } else {
          double[] pr = BaumWelchUtils.toDoublePair(((BytesWritable) e.getValue()).getBytes());
          double num = pr[0];
          double denom = pr[1];
          if (!sumOfStripes.containsKey(e.getKey())) {
            sumOfStripes.put((IntWritable) e.getKey(), (BytesWritable) e.getValue());
          } else {
            double[] pr1 = BaumWelchUtils.toDoublePair(((BytesWritable) sumOfStripes.get(e.getKey())).getBytes());
            num += pr1[0];
            denom += pr1[1];
            byte[] doublePair1 = BaumWelchUtils.doublePairToByteArray(num, denom);
            sumOfStripes.put((IntWritable) e.getKey(), new BytesWritable(doublePair1));
          }
        }
      }
    }
  } else {
    for (MapWritable stripe : stripes) {
      for (Map.Entry e : stripe.entrySet()) {
        double val = ((DoubleWritable) e.getValue()).get();
        if (!sumOfStripes.containsKey(e.getKey())) {
          sumOfStripes.put((IntWritable) e.getKey(), (DoubleWritable) e.getValue());
        } else {
          val += ((DoubleWritable) sumOfStripes.get(e.getKey())).get();
          sumOfStripes.put((IntWritable) e.getKey(), new DoubleWritable(val));
        }
      }
    }
  }
  context.write(key, sumOfStripes);
}
From source file:org.apache.mahout.classifier.sequencelearning.hmm.hadoop.BaumWelchMapper.java
License:Apache License
@Override
public void setup(Context context) throws IOException, InterruptedException {
  super.setup(context);
  Configuration config = context.getConfiguration();
  String scalingMethod = config.get(BaumWelchConfigKeys.SCALING_OPTION_KEY);
  if (scalingMethod.equals("rescaling")) {
    scaling = HmmAlgorithms.ScalingMethod.RESCALING;
  } else if (scalingMethod.equals("logscaling")) {
    scaling = HmmAlgorithms.ScalingMethod.LOGSCALING;
  }
  nrOfHiddenStates = Integer.parseInt(config.get(BaumWelchConfigKeys.NUMBER_OF_HIDDEN_STATES_KEY));
  nrOfEmittedStates = Integer.parseInt(config.get(BaumWelchConfigKeys.NUMBER_OF_EMITTED_STATES_KEY));
  MapWritable hiddenStatesWritableMap = MapWritableCache.load(config,
      new Path(config.get(BaumWelchConfigKeys.HIDDEN_STATES_MAP_PATH)));
  MapWritable emittedStatesWritableMap = MapWritableCache.load(config,
      new Path(config.get(BaumWelchConfigKeys.EMITTED_STATES_MAP_PATH)));
  String[] hiddenStatesArray = new String[hiddenStatesWritableMap.size()];
  String[] emittedStatesArray = new String[emittedStatesWritableMap.size()];
  int k = 0;
  int l = 0;
  for (MapWritable.Entry<Writable, Writable> entry : hiddenStatesWritableMap.entrySet()) {
    hiddenStatesArray[k++] = entry.getKey().toString();
  }
  for (MapWritable.Entry<Writable, Writable> entry : emittedStatesWritableMap.entrySet()) {
    emittedStatesArray[l++] = entry.getKey().toString();
  }
  modelPath = new Path(config.get(BaumWelchConfigKeys.MODEL_PATH_KEY));
  Model = BaumWelchUtils.createHmmModel(nrOfHiddenStates, nrOfEmittedStates, modelPath, config);
  Model.registerHiddenStateNames(hiddenStatesArray);
  Model.registerOutputStateNames(emittedStatesArray);
  HmmUtils.normalizeModel(Model);
  HmmUtils.validate(Model);
  log.info("Mapper Setup Hmm Model Created. Hidden States = {} Emitted States = {}",
      Model.getNrOfHiddenStates(), Model.getNrOfOutputStates());
}
From source file:org.apache.mahout.classifier.sequencelearning.hmm.hadoop.BaumWelchReducer.java
License:Apache License
@Override
protected void reduce(Text key, Iterable<MapWritable> stripes, Context context)
    throws IOException, InterruptedException {
  MapWritable sumOfStripes = new MapWritable();
  // Finish the Expectation Step by aggregating all posterior probabilities for one key
  if (scaling.equals("logscaling")) {
    double totalValSum = Double.NEGATIVE_INFINITY;
    for (MapWritable stripe : stripes) {
      for (Map.Entry e : stripe.entrySet()) {
        double val = ((DoubleWritable) e.getValue()).get();
        double max = totalValSum > val ? totalValSum : val;
        totalValSum = max + Math.log(Math.exp(totalValSum - max) + Math.exp(val - max));
        if (!sumOfStripes.containsKey(e.getKey())) {
          sumOfStripes.put((IntWritable) e.getKey(), new DoubleWritable(val));
        } else {
          double sumSripesVal = ((DoubleWritable) sumOfStripes.get(e.getKey())).get();
          if (sumSripesVal > Double.NEGATIVE_INFINITY) {
            val = val + Math.log(1 + Math.exp(sumSripesVal - val));
          }
          sumOfStripes.put((IntWritable) e.getKey(), new DoubleWritable(val));
        }
      }
    }
    // normalize the aggregate
    for (Map.Entry e : sumOfStripes.entrySet()) {
      double val = ((DoubleWritable) e.getValue()).get();
      if (totalValSum > Double.NEGATIVE_INFINITY) {
        val = val - totalValSum;
      }
      sumOfStripes.put((IntWritable) e.getKey(), new DoubleWritable(Math.exp(val)));
    }
  } else if (scaling.equals("rescaling")) {
    double totalValSum = 0.0;
    for (MapWritable stripe : stripes) {
      for (Map.Entry e : stripe.entrySet()) {
        if (key.charAt(0) == (int) 'I') {
          double val = ((DoubleWritable) e.getValue()).get();
          totalValSum += val;
          if (!sumOfStripes.containsKey(e.getKey())) {
            sumOfStripes.put((IntWritable) e.getKey(), (DoubleWritable) e.getValue());
          } else {
            val += ((DoubleWritable) sumOfStripes.get(e.getKey())).get();
            sumOfStripes.put((IntWritable) e.getKey(), new DoubleWritable(val));
          }
        } else {
          double[] pr = BaumWelchUtils.toDoublePair(((BytesWritable) e.getValue()).getBytes());
          double num = pr[0];
          double denom = pr[1];
          if (!sumOfStripes.containsKey(e.getKey())) {
            sumOfStripes.put((IntWritable) e.getKey(), (BytesWritable) e.getValue());
          } else {
            double[] pr1 = BaumWelchUtils.toDoublePair(((BytesWritable) sumOfStripes.get(e.getKey())).getBytes());
            num += pr1[0];
            denom += pr1[1];
            byte[] doublePair1 = BaumWelchUtils.doublePairToByteArray(num, denom);
            sumOfStripes.put((IntWritable) e.getKey(), new BytesWritable(doublePair1));
          }
        }
      }
    }
    if (key.charAt(0) == (int) 'I') {
      // normalize the aggregate
      for (Map.Entry e : sumOfStripes.entrySet()) {
        double val = ((DoubleWritable) e.getValue()).get();
        if (totalValSum > 0) {
          val /= totalValSum;
        }
        sumOfStripes.put((IntWritable) e.getKey(), new DoubleWritable(val));
      }
    } else {
      // compute the probabilities
      for (Map.Entry e : sumOfStripes.entrySet()) {
        double[] pr1 = BaumWelchUtils.toDoublePair(((BytesWritable) sumOfStripes.get(e.getKey())).getBytes());
        sumOfStripes.put((IntWritable) e.getKey(), new DoubleWritable(pr1[0] / pr1[1]));
      }
    }
  } else {
    double totalValSum = 0.0;
    for (MapWritable stripe : stripes) {
      for (Map.Entry e : stripe.entrySet()) {
        int state = ((IntWritable) e.getKey()).get();
        double val = ((DoubleWritable) e.getValue()).get();
        totalValSum += val;
        if (!sumOfStripes.containsKey(e.getKey())) {
          sumOfStripes.put((IntWritable) e.getKey(), (DoubleWritable) e.getValue());
        } else {
          val += ((DoubleWritable) sumOfStripes.get(e.getKey())).get();
          sumOfStripes.put((IntWritable) e.getKey(), new DoubleWritable(val));
        }
      }
    }
    // normalize the aggregate
    for (Map.Entry e : sumOfStripes.entrySet()) {
      double val = ((DoubleWritable) e.getValue()).get();
      if (totalValSum > 0) {
        val /= totalValSum;
      }
      sumOfStripes.put((IntWritable) e.getKey(), new DoubleWritable(val));
    }
  }
  // Write the distribution parameter vector to HDFS for the next iteration
  context.write(key, sumOfStripes);
}
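The log-scaling branch above accumulates probabilities in log space to avoid underflow: adding a new log value v to a running log total s uses s' = v + log(1 + exp(s - v)), which equals log(exp(s) + exp(v)). The small standalone sketch below is illustrative only (not part of the Mahout sources) and checks that identity for a pair of log probabilities.

public class LogSpaceSumSketch {
  public static void main(String[] args) {
    // two probabilities represented in log space
    double logP1 = Math.log(1e-3);
    double logP2 = Math.log(4e-3);
    // accumulate them the way the log-scaling reducer/combiner branches do
    double logSum = logP2 + Math.log(1 + Math.exp(logP1 - logP2));
    // direct computation for comparison; both print approximately 0.005
    System.out.println(Math.exp(logSum) + " vs " + (1e-3 + 4e-3));
  }
}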
From source file:org.apache.mahout.classifier.sequencelearning.hmm.hadoop.BaumWelchUtils.java
License:Apache License
/**
 * Converts the sequence files present in a directory to a {@link HmmModel} model.
 *
 * @param nrOfHiddenStates Number of hidden states
 * @param nrOfOutputStates Number of output states
 * @param modelPath        Location of the sequence files containing the model's distributions
 * @param conf             Configuration object
 * @return HmmModel the encoded model
 * @throws IOException
 */
public static HmmModel createHmmModel(int nrOfHiddenStates, int nrOfOutputStates, Path modelPath,
    Configuration conf) throws IOException {
  log.info("Entering Create Hmm Model. Model Path = {}", modelPath.toUri());
  Vector initialProbabilities = new DenseVector(nrOfHiddenStates);
  Matrix transitionMatrix = new DenseMatrix(nrOfHiddenStates, nrOfHiddenStates);
  Matrix emissionMatrix = new DenseMatrix(nrOfHiddenStates, nrOfOutputStates);
  // Get the path location where the seq files encoding model are stored
  Path modelFilesPath = new Path(modelPath, "*");
  Collection<Path> result = new ArrayList<Path>();
  // get all filtered file names in result list
  FileSystem fs = modelFilesPath.getFileSystem(conf);
  FileStatus[] matches = fs.listStatus(
      FileUtil.stat2Paths(fs.globStatus(modelFilesPath, PathFilters.partFilter())),
      PathFilters.partFilter());
  for (FileStatus match : matches) {
    result.add(fs.makeQualified(match.getPath()));
  }
  // iterate through the result path list
  for (Path path : result) {
    for (Pair<Writable, MapWritable> pair : new SequenceFileIterable<Writable, MapWritable>(path, true, conf)) {
      Text key = (Text) pair.getFirst();
      MapWritable valueMap = pair.getSecond();
      if (key.charAt(0) == (int) 'I') {
        // initial distribution stripe
        for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
          initialProbabilities.set(((IntWritable) entry.getKey()).get(),
              ((DoubleWritable) entry.getValue()).get());
        }
      } else if (key.charAt(0) == (int) 'T') {
        // transition distribution stripe; key is of the form TRANSIT_0, TRANSIT_1 etc
        int stateID = Integer.parseInt(key.toString().split("_")[1]);
        for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
          transitionMatrix.set(stateID, ((IntWritable) entry.getKey()).get(),
              ((DoubleWritable) entry.getValue()).get());
        }
      } else if (key.charAt(0) == (int) 'E') {
        // emission distribution stripe; key is of the form EMIT_0, EMIT_1 etc
        int stateID = Integer.parseInt(key.toString().split("_")[1]);
        for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
          emissionMatrix.set(stateID, ((IntWritable) entry.getKey()).get(),
              ((DoubleWritable) entry.getValue()).get());
        }
      } else {
        throw new IllegalStateException("Error creating HmmModel from Sequence File Path");
      }
    }
  }
  HmmModel model = new HmmModel(transitionMatrix, emissionMatrix, initialProbabilities);
  if (model != null) {
    return model;
  } else {
    throw new IOException("Error building model from output location");
  }
}
From source file:org.apache.mahout.classifier.sequencelearning.hmm.hadoop.BaumWelchUtils.java
License:Apache License
/**
 * Encodes a particular HmmModel as a Sequence File and writes it to the specified location.
 *
 * @param model     HmmModel to be encoded
 * @param modelPath Location to store the encoded model
 * @param conf      Configuration object
 * @throws IOException
 */
protected static void writeModelToDirectory(HmmModel model, Path modelPath, Configuration conf)
    throws IOException {
  int numHidden = model.getNrOfHiddenStates();
  int numObserved = model.getNrOfOutputStates();
  Matrix emissionMatrix = model.getEmissionMatrix();
  Matrix transitionMatrix = model.getTransitionMatrix();
  Vector initialProbability = model.getInitialProbabilities();
  MapWritable initialDistributionMap = new MapWritable();
  MapWritable transitionDistributionMap = new MapWritable();
  MapWritable emissionDistributionMap = new MapWritable();
  // delete the output directory
  HadoopUtil.delete(conf, modelPath);
  // create new file to store HMM
  FileSystem fs = FileSystem.get(modelPath.toUri(), conf);
  Path outFile = new Path(modelPath, "part-randomSeed");
  boolean newFile = fs.createNewFile(outFile);
  if (newFile) {
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, outFile, Text.class, MapWritable.class);
    try {
      for (int i = 0; i < numHidden; i++) {
        IntWritable initialDistributionKey = new IntWritable(i);
        DoubleWritable initialDistributionValue = new DoubleWritable(initialProbability.get(i));
        initialDistributionMap.put(initialDistributionKey, initialDistributionValue);
        Text transitionDistributionKey = new Text("TRANSIT_" + Integer.toString(i));
        MapWritable transitionDistributionValue = new MapWritable();
        for (int j = 0; j < numHidden; j++) {
          IntWritable transitionDistributionInnerKey = new IntWritable(j);
          DoubleWritable transitionDistributionInnerValue = new DoubleWritable(transitionMatrix.get(i, j));
          transitionDistributionValue.put(transitionDistributionInnerKey, transitionDistributionInnerValue);
        }
        transitionDistributionMap.put(transitionDistributionKey, transitionDistributionValue);
        Text emissionDistributionKey = new Text("EMIT_" + Integer.toString(i));
        MapWritable emissionDistributionValue = new MapWritable();
        for (int j = 0; j < numObserved; j++) {
          IntWritable emissionDistributionInnerKey = new IntWritable(j);
          DoubleWritable emissionDistributionInnerValue = new DoubleWritable(emissionMatrix.get(i, j));
          emissionDistributionValue.put(emissionDistributionInnerKey, emissionDistributionInnerValue);
        }
        emissionDistributionMap.put(emissionDistributionKey, emissionDistributionValue);
      }
      writer.append(new Text("INITIAL"), initialDistributionMap);
      log.info("Wrote random Initial Distribution Map to {}", outFile);
      for (MapWritable.Entry<Writable, Writable> transitionEntry : transitionDistributionMap.entrySet()) {
        writer.append(transitionEntry.getKey(), transitionEntry.getValue());
      }
      log.info("Wrote random Transition Distribution Map to {}", outFile);
      for (MapWritable.Entry<Writable, Writable> emissionEntry : emissionDistributionMap.entrySet()) {
        writer.append(emissionEntry.getKey(), emissionEntry.getValue());
      }
      log.info("Wrote random Emission Distribution Map to {}", outFile);
    } finally {
      Closeables.closeQuietly(writer);
    }
  }
}
From source file:org.apache.nutch.crawl.CrawlDatum.java
License:Apache License
private boolean metadataEquals(org.apache.hadoop.io.MapWritable otherMetaData) {
  if (metaData == null || metaData.size() == 0) {
    return otherMetaData == null || otherMetaData.size() == 0;
  }
  if (otherMetaData == null) {
    // we already know that the current object is not null or empty
    return false;
  }
  HashSet<Entry<Writable, Writable>> set1 = new HashSet<Entry<Writable, Writable>>(metaData.entrySet());
  HashSet<Entry<Writable, Writable>> set2 = new HashSet<Entry<Writable, Writable>>(otherMetaData.entrySet());
  return set1.equals(set2);
}
From source file:org.apache.nutch.crawl.CrawlDbReducer.java
License:Apache License
public void reduce(Text key, Iterator<CrawlDatum> values, OutputCollector<Text, CrawlDatum> output,
    Reporter reporter) throws IOException {
  CrawlDatum fetch = new CrawlDatum();
  CrawlDatum old = new CrawlDatum();
  boolean fetchSet = false;
  boolean oldSet = false;
  byte[] signature = null;
  boolean multiple = false; // avoid deep copy when only single value exists
  linked.clear();
  org.apache.hadoop.io.MapWritable metaFromParse = null;

  while (values.hasNext()) {
    CrawlDatum datum = (CrawlDatum) values.next();
    if (!multiple && values.hasNext())
      multiple = true;
    if (CrawlDatum.hasDbStatus(datum)) {
      if (!oldSet) {
        if (multiple) {
          old.set(datum);
        } else {
          // no need for a deep copy - this is the only value
          old = datum;
        }
        oldSet = true;
      } else {
        // always take the latest version
        if (old.getFetchTime() < datum.getFetchTime())
          old.set(datum);
      }
      continue;
    }
    if (CrawlDatum.hasFetchStatus(datum)) {
      if (!fetchSet) {
        if (multiple) {
          fetch.set(datum);
        } else {
          fetch = datum;
        }
        fetchSet = true;
      } else {
        // always take the latest version
        if (fetch.getFetchTime() < datum.getFetchTime())
          fetch.set(datum);
      }
      continue;
    }
    switch (datum.getStatus()) { // collect other info
    case CrawlDatum.STATUS_LINKED:
      CrawlDatum link;
      if (multiple) {
        link = new CrawlDatum();
        link.set(datum);
      } else {
        link = datum;
      }
      linked.insert(link);
      break;
    case CrawlDatum.STATUS_SIGNATURE:
      signature = datum.getSignature();
      break;
    case CrawlDatum.STATUS_PARSE_META:
      metaFromParse = datum.getMetaData();
      break;
    default:
      LOG.warn("Unknown status, key: " + key + ", datum: " + datum);
    }
  }

  // copy the content of the queue into a List in reversed order
  int numLinks = linked.size();
  List<CrawlDatum> linkList = new ArrayList<CrawlDatum>(numLinks);
  for (int i = numLinks - 1; i >= 0; i--) {
    linkList.add(linked.pop());
  }

  // if it doesn't already exist, skip it
  if (!oldSet && !additionsAllowed)
    return;

  // if there is no fetched datum, perhaps there is a link
  if (!fetchSet && linkList.size() > 0) {
    fetch = linkList.get(0);
    fetchSet = true;
  }

  // still no new data - record only unchanged old data, if exists, and return
  if (!fetchSet) {
    if (oldSet) { // at this point at least "old" should be present
      output.collect(key, old);
      reporter.getCounter("CrawlDB status", CrawlDatum.getStatusName(old.getStatus())).increment(1);
    } else {
      LOG.warn("Missing fetch and old value, signature=" + signature);
    }
    return;
  }

  if (signature == null)
    signature = fetch.getSignature();
  long prevModifiedTime = oldSet ? old.getModifiedTime() : 0L;
  long prevFetchTime = oldSet ? old.getFetchTime() : 0L;

  // initialize with the latest version, be it fetch or link
  result.set(fetch);
  if (oldSet) {
    // copy metadata from old, if exists
    if (old.getMetaData().size() > 0) {
      result.putAllMetaData(old);
      // overlay with new, if any
      if (fetch.getMetaData().size() > 0)
        result.putAllMetaData(fetch);
    }
    // set the most recent valid value of modifiedTime
    if (old.getModifiedTime() > 0 && fetch.getModifiedTime() == 0) {
      result.setModifiedTime(old.getModifiedTime());
    }
  }

  switch (fetch.getStatus()) { // determine new status
  case CrawlDatum.STATUS_LINKED: // it was link
    if (oldSet) { // if old exists
      result.set(old); // use it
    } else {
      result = schedule.initializeSchedule((Text) key, result);
      result.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
      try {
        scfilters.initialScore((Text) key, result);
      } catch (ScoringFilterException e) {
        if (LOG.isWarnEnabled()) {
          LOG.warn("Cannot filter init score for url " + key + ", using default: " + e.getMessage());
        }
        result.setScore(0.0f);
      }
    }
    break;

  case CrawlDatum.STATUS_FETCH_SUCCESS: // succesful fetch
  case CrawlDatum.STATUS_FETCH_REDIR_TEMP: // successful fetch, redirected
  case CrawlDatum.STATUS_FETCH_REDIR_PERM:
  case CrawlDatum.STATUS_FETCH_NOTMODIFIED: // successful fetch, notmodified
    // determine the modification status
    int modified = FetchSchedule.STATUS_UNKNOWN;
    if (fetch.getStatus() == CrawlDatum.STATUS_FETCH_NOTMODIFIED) {
      modified = FetchSchedule.STATUS_NOTMODIFIED;
    } else {
      if (oldSet && old.getSignature() != null && signature != null) {
        if (SignatureComparator._compare(old.getSignature(), signature) != 0) {
          modified = FetchSchedule.STATUS_MODIFIED;
        } else {
          modified = FetchSchedule.STATUS_NOTMODIFIED;
        }
      }
    }
    // set the schedule
    result = schedule.setFetchSchedule((Text) key, result, prevFetchTime, prevModifiedTime,
        fetch.getFetchTime(), fetch.getModifiedTime(), modified);
    // set the result status and signature
    if (modified == FetchSchedule.STATUS_NOTMODIFIED) {
      result.setStatus(CrawlDatum.STATUS_DB_NOTMODIFIED);
      if (oldSet)
        result.setSignature(old.getSignature());
    } else {
      switch (fetch.getStatus()) {
      case CrawlDatum.STATUS_FETCH_SUCCESS:
        result.setStatus(CrawlDatum.STATUS_DB_FETCHED);
        break;
      case CrawlDatum.STATUS_FETCH_REDIR_PERM:
        result.setStatus(CrawlDatum.STATUS_DB_REDIR_PERM);
        break;
      case CrawlDatum.STATUS_FETCH_REDIR_TEMP:
        result.setStatus(CrawlDatum.STATUS_DB_REDIR_TEMP);
        break;
      default:
        LOG.warn("Unexpected status: " + fetch.getStatus() + " resetting to old status.");
        if (oldSet)
          result.setStatus(old.getStatus());
        else
          result.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
      }
      result.setSignature(signature);
      if (metaFromParse != null) {
        for (Entry<Writable, Writable> e : metaFromParse.entrySet()) {
          result.getMetaData().put(e.getKey(), e.getValue());
        }
      }
    }
    // if fetchInterval is larger than the system-wide maximum, trigger
    // an unconditional recrawl. This prevents the page to be stuck at
    // NOTMODIFIED state, when the old fetched copy was already removed with
    // old segments.
    if (maxInterval < result.getFetchInterval())
      result = schedule.forceRefetch((Text) key, result, false);
    break;

  case CrawlDatum.STATUS_SIGNATURE:
    if (LOG.isWarnEnabled()) {
      LOG.warn("Lone CrawlDatum.STATUS_SIGNATURE: " + key);
    }
    return;

  case CrawlDatum.STATUS_FETCH_RETRY: // temporary failure
    if (oldSet) {
      result.setSignature(old.getSignature()); // use old signature
    }
    result = schedule.setPageRetrySchedule((Text) key, result, prevFetchTime, prevModifiedTime,
        fetch.getFetchTime());
    if (result.getRetriesSinceFetch() < retryMax) {
      result.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
    } else {
      result.setStatus(CrawlDatum.STATUS_DB_GONE);
    }
    break;

  case CrawlDatum.STATUS_FETCH_GONE: // permanent failure
    if (oldSet)
      result.setSignature(old.getSignature()); // use old signature
    result.setStatus(CrawlDatum.STATUS_DB_GONE);
    result = schedule.setPageGoneSchedule((Text) key, result, prevFetchTime, prevModifiedTime,
        fetch.getFetchTime());
    break;

  default:
    throw new RuntimeException("Unknown status: " + fetch.getStatus() + " " + key);
  }

  try {
    scfilters.updateDbScore((Text) key, oldSet ? old : null, result, linkList);
  } catch (Exception e) {
    if (LOG.isWarnEnabled()) {
      LOG.warn("Couldn't update score, key=" + key + ": " + e);
    }
  }
  // remove generation time, if any
  result.getMetaData().remove(Nutch.WRITABLE_GENERATE_TIME_KEY);
  output.collect(key, result);
  reporter.getCounter("CrawlDB status", CrawlDatum.getStatusName(result.getStatus())).increment(1);
}