Example usage for org.apache.hadoop.io DefaultStringifier toString

Introduction

This page collects example usages of org.apache.hadoop.io.DefaultStringifier.toString, drawn from the Apache Mahout sources listed under Usage.

Prototype

@Override
public String toString(T obj) throws IOException;
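
Every example below follows the same round trip: serialize an object to a String with toString, stash it in the job Configuration, and rebuild it on the worker side with fromString. As a warm-up, here is a minimal, self-contained sketch of that round trip (the class name and the "demo.counts" key are illustrative, not taken from the sources below):

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.DefaultStringifier;
import org.apache.hadoop.util.GenericsUtil;

public class DefaultStringifierDemo {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // DefaultStringifier picks a serializer for the target class from this list;
        // HashMap needs JavaSerialization, which is not enabled by default. This is
        // the line the comments in the examples below warn you never to forget.
        conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
                + "org.apache.hadoop.io.serializer.WritableSerialization");

        Map<String, Double> counts = new HashMap<String, Double>();
        counts.put("label", 42.0);

        DefaultStringifier<Map<String, Double>> stringifier = new DefaultStringifier<Map<String, Double>>(
                conf, GenericsUtil.getClass(counts));

        // toString serializes the map and Base64-encodes the bytes, so the result
        // is safe to carry as a plain Configuration string value.
        String encoded = stringifier.toString(counts);
        conf.set("demo.counts", encoded);

        Map<String, Double> decoded = stringifier.fromString(conf.get("demo.counts"));
        System.out.println(decoded); // prints {label=42.0}
    }
}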

Usage

From source file: org.apache.mahout.classifier.bayes.common.BayesTfIdfMapper.java

License: Apache License

@Override
public void configure(JobConf job) {
    try {
        if (labelDocumentCounts == null) {
            labelDocumentCounts = new HashMap<String, Double>();

            DefaultStringifier<Map<String, Double>> mapStringifier = new DefaultStringifier<Map<String, Double>>(
                    job, GenericsUtil.getClass(labelDocumentCounts));

            String labelDocumentCountString = mapStringifier.toString(labelDocumentCounts);
            labelDocumentCountString = job.get("cnaivebayes.labelDocumentCounts", labelDocumentCountString);

            labelDocumentCounts = mapStringifier.fromString(labelDocumentCountString);
        }
    } catch (IOException ex) {
        log.warn(ex.toString(), ex);
    }
}
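
Incidentally, the toString/fromString-plus-conf.get dance above can be collapsed with DefaultStringifier's static store and load helpers, which do the stringify-and-set (and get-and-destringify) in one call. A sketch, reusing job and labelDocumentCounts from the configure method above:

// store() stringifies the map and sets it on the Configuration under the key;
// load() gets the key back and destringifies it.
DefaultStringifier.store(job, labelDocumentCounts, "cnaivebayes.labelDocumentCounts");
Map<String, Double> restored = DefaultStringifier.load(job, "cnaivebayes.labelDocumentCounts",
        GenericsUtil.getClass(labelDocumentCounts));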

From source file: org.apache.mahout.classifier.bayes.mapreduce.bayes.BayesThetaNormalizerDriver.java

License: Apache License

@Override
public void runJob(Path input, Path output, BayesParameters params) throws IOException {
    Configurable client = new JobClient();
    JobConf conf = new JobConf(BayesThetaNormalizerDriver.class);

    conf.setJobName("Bayes Theta Normalizer Driver running over input: " + input);

    conf.setOutputKeyClass(StringTuple.class);
    conf.setOutputValueClass(DoubleWritable.class);
    FileInputFormat.addInputPath(conf, new Path(output, "trainer-tfIdf/trainer-tfIdf"));
    Path outPath = new Path(output, "trainer-thetaNormalizer");
    FileOutputFormat.setOutputPath(conf, outPath);
    // conf.setNumMapTasks(100);
    // conf.setNumReduceTasks(1);
    conf.setMapperClass(BayesThetaNormalizerMapper.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setCombinerClass(BayesThetaNormalizerReducer.class);
    conf.setReducerClass(BayesThetaNormalizerReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    // Don't ever forget this. People should keep track of how Hadoop conf
    // parameters can make or break a piece of code.

    HadoopUtil.overwriteOutput(outPath);
    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);

    Path sigmaKFiles = new Path(output, "trainer-weights/Sigma_k/*");
    Map<String, Double> labelWeightSum = SequenceFileModelReader.readLabelSums(dfs, sigmaKFiles, conf);
    DefaultStringifier<Map<String, Double>> mapStringifier = new DefaultStringifier<Map<String, Double>>(conf,
            GenericsUtil.getClass(labelWeightSum));
    String labelWeightSumString = mapStringifier.toString(labelWeightSum);

    log.info("Sigma_k for Each Label");
    Map<String, Double> c = mapStringifier.fromString(labelWeightSumString);
    log.info("{}", c);
    conf.set("cnaivebayes.sigma_k", labelWeightSumString);

    Path sigmaJSigmaKFile = new Path(output, "trainer-weights/Sigma_kSigma_j/*");
    double sigmaJSigmaK = SequenceFileModelReader.readSigmaJSigmaK(dfs, sigmaJSigmaKFile, conf);
    DefaultStringifier<Double> stringifier = new DefaultStringifier<Double>(conf, Double.class);
    String sigmaJSigmaKString = stringifier.toString(sigmaJSigmaK);

    log.info("Sigma_kSigma_j for each Label and for each Features");
    double retSigmaJSigmaK = stringifier.fromString(sigmaJSigmaKString);
    log.info("{}", retSigmaJSigmaK);
    conf.set("cnaivebayes.sigma_jSigma_k", sigmaJSigmaKString);

    Path vocabCountFile = new Path(output, "trainer-tfIdf/trainer-vocabCount/*");
    double vocabCount = SequenceFileModelReader.readVocabCount(dfs, vocabCountFile, conf);
    String vocabCountString = stringifier.toString(vocabCount);

    log.info("Vocabulary Count");
    conf.set("cnaivebayes.vocabCount", vocabCountString);
    double retvocabCount = stringifier.fromString(vocabCountString);
    log.info("{}", retvocabCount);
    conf.set("bayes.parameters", params.toString());
    conf.set("output.table", output.toString());
    client.setConf(conf);

    JobClient.runJob(conf);

}

From source file: org.apache.mahout.classifier.bayes.mapreduce.bayes.BayesThetaNormalizerMapper.java

License: Apache License

@Override
public void configure(JobConf job) {
    try {
        labelWeightSum.clear();
        Map<String, Double> labelWeightSumTemp = new HashMap<String, Double>();

        DefaultStringifier<Map<String, Double>> mapStringifier = new DefaultStringifier<Map<String, Double>>(
                job, GenericsUtil.getClass(labelWeightSumTemp));

        String labelWeightSumString = job.get("cnaivebayes.sigma_k",
                mapStringifier.toString(labelWeightSumTemp));
        labelWeightSumTemp = mapStringifier.fromString(labelWeightSumString);
        for (Map.Entry<String, Double> stringDoubleEntry : labelWeightSumTemp.entrySet()) {
            this.labelWeightSum.put(stringDoubleEntry.getKey(), stringDoubleEntry.getValue());
        }
        DefaultStringifier<Double> stringifier = new DefaultStringifier<Double>(job,
                GenericsUtil.getClass(sigmaJSigmaK));
        String sigmaJSigmaKString = job.get("cnaivebayes.sigma_jSigma_k", stringifier.toString(sigmaJSigmaK));
        sigmaJSigmaK = stringifier.fromString(sigmaJSigmaKString);

        String vocabCountString = stringifier.toString(vocabCount);
        vocabCountString = job.get("cnaivebayes.vocabCount", vocabCountString);
        vocabCount = stringifier.fromString(vocabCountString);

        Parameters params = Parameters.fromString(job.get("bayes.parameters", ""));
        alphaI = Double.valueOf(params.get("alpha_i", "1.0"));

    } catch (IOException ex) {
        log.warn(ex.toString(), ex);
    }
}

From source file: org.apache.mahout.classifier.bayes.mapreduce.cbayes.CBayesThetaNormalizerDriver.java

License: Apache License

@Override
public void runJob(Path input, Path output, BayesParameters params) throws IOException {
    Configurable client = new JobClient();
    JobConf conf = new JobConf(CBayesThetaNormalizerDriver.class);
    conf.setJobName("Complementary Bayes Theta Normalizer Driver running over input: " + input);

    conf.setOutputKeyClass(StringTuple.class);
    conf.setOutputValueClass(DoubleWritable.class);
    FileInputFormat.addInputPath(conf, new Path(output, "trainer-weights/Sigma_j"));
    FileInputFormat.addInputPath(conf, new Path(output, "trainer-tfIdf/trainer-tfIdf"));
    Path outPath = new Path(output, "trainer-thetaNormalizer");
    FileOutputFormat.setOutputPath(conf, outPath);
    // conf.setNumMapTasks(100);
    // conf.setNumReduceTasks(1);
    conf.setMapperClass(CBayesThetaNormalizerMapper.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setCombinerClass(CBayesThetaNormalizerReducer.class);
    conf.setReducerClass(CBayesThetaNormalizerReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.set("io.serializations",
            "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");
    // Don't ever forget this. People should keep track of how Hadoop conf
    // parameters can make or break a piece of code.

    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    HadoopUtil.overwriteOutput(outPath);

    Path sigmaKFiles = new Path(output, "trainer-weights/Sigma_k/*");
    Map<String, Double> labelWeightSum = SequenceFileModelReader.readLabelSums(dfs, sigmaKFiles, conf);
    DefaultStringifier<Map<String, Double>> mapStringifier = new DefaultStringifier<Map<String, Double>>(conf,
            GenericsUtil.getClass(labelWeightSum));
    String labelWeightSumString = mapStringifier.toString(labelWeightSum);

    log.info("Sigma_k for Each Label");
    Map<String, Double> c = mapStringifier.fromString(labelWeightSumString);
    log.info("{}", c);
    conf.set("cnaivebayes.sigma_k", labelWeightSumString);

    Path sigmaKSigmaJFile = new Path(output, "trainer-weights/Sigma_kSigma_j/*");
    double sigmaJSigmaK = SequenceFileModelReader.readSigmaJSigmaK(dfs, sigmaKSigmaJFile, conf);
    DefaultStringifier<Double> stringifier = new DefaultStringifier<Double>(conf, Double.class);
    String sigmaJSigmaKString = stringifier.toString(sigmaJSigmaK);

    log.info("Sigma_kSigma_j for each Label and for each Features");
    double retSigmaJSigmaK = stringifier.fromString(sigmaJSigmaKString);
    log.info("{}", retSigmaJSigmaK);
    conf.set("cnaivebayes.sigma_jSigma_k", sigmaJSigmaKString);

    Path vocabCountFile = new Path(output, "trainer-tfIdf/trainer-vocabCount/*");
    double vocabCount = SequenceFileModelReader.readVocabCount(dfs, vocabCountFile, conf);
    String vocabCountString = stringifier.toString(vocabCount);

    log.info("Vocabulary Count");
    conf.set("cnaivebayes.vocabCount", vocabCountString);
    double retvocabCount = stringifier.fromString(vocabCountString);
    log.info("{}", retvocabCount);
    conf.set("bayes.parameters", params.toString());
    conf.set("output.table", output.toString());
    client.setConf(conf);

    JobClient.runJob(conf);

}

From source file: org.apache.mahout.classifier.bayes.mapreduce.cbayes.CBayesThetaNormalizerMapper.java

License: Apache License

@Override
public void configure(JobConf job) {
    try {
        labelWeightSum.clear();
        Map<String, Double> labelWeightSumTemp = new HashMap<String, Double>();

        DefaultStringifier<Map<String, Double>> mapStringifier = new DefaultStringifier<Map<String, Double>>(
                job, GenericsUtil.getClass(labelWeightSumTemp));

        String labelWeightSumString = job.get("cnaivebayes.sigma_k",
                mapStringifier.toString(labelWeightSumTemp));
        labelWeightSumTemp = mapStringifier.fromString(labelWeightSumString);
        for (Map.Entry<String, Double> stringDoubleEntry : labelWeightSumTemp.entrySet()) {
            this.labelWeightSum.put(stringDoubleEntry.getKey(), stringDoubleEntry.getValue());
        }

        DefaultStringifier<Double> stringifier = new DefaultStringifier<Double>(job,
                GenericsUtil.getClass(sigmaJSigmaK));
        String sigmaJSigmaKString = job.get("cnaivebayes.sigma_jSigma_k", stringifier.toString(sigmaJSigmaK));
        sigmaJSigmaK = stringifier.fromString(sigmaJSigmaKString);

        String vocabCountString = job.get("cnaivebayes.vocabCount", stringifier.toString(vocabCount));
        vocabCount = stringifier.fromString(vocabCountString);

        Parameters params = Parameters.fromString(job.get("bayes.parameters", ""));
        alphaI = Double.valueOf(params.get("alpha_i", "1.0"));

    } catch (IOException ex) {
        log.warn(ex.toString(), ex);
    }
}

From source file: org.apache.mahout.classifier.bayes.mapreduce.common.BayesTfIdfDriver.java

License: Apache License

@Override
public void runJob(Path input, Path output, BayesParameters params) throws IOException {

    Configurable client = new JobClient();
    JobConf conf = new JobConf(BayesWeightSummerDriver.class);
    conf.setJobName("TfIdf Driver running over input: " + input);

    conf.setOutputKeyClass(StringTuple.class);
    conf.setOutputValueClass(DoubleWritable.class);

    FileInputFormat.addInputPath(conf, new Path(output, "trainer-termDocCount"));
    FileInputFormat.addInputPath(conf, new Path(output, "trainer-wordFreq"));
    FileInputFormat.addInputPath(conf, new Path(output, "trainer-featureCount"));
    Path outPath = new Path(output, "trainer-tfIdf");
    FileOutputFormat.setOutputPath(conf, outPath);

    // conf.setNumMapTasks(100);

    conf.setJarByClass(BayesTfIdfDriver.class);

    conf.setMapperClass(BayesTfIdfMapper.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setCombinerClass(BayesTfIdfReducer.class);

    conf.setReducerClass(BayesTfIdfReducer.class);

    conf.setOutputFormat(BayesTfIdfOutputFormat.class);

    conf.set("io.serializations",
            "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");
    // Don't ever forget this. People should keep track of how Hadoop conf
    // parameters can make or break a piece of code.

    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    HadoopUtil.overwriteOutput(outPath);

    Path interimFile = new Path(output, "trainer-docCount/part-*");

    Map<String, Double> labelDocumentCounts = SequenceFileModelReader.readLabelDocumentCounts(dfs, interimFile,
            conf);

    DefaultStringifier<Map<String, Double>> mapStringifier = new DefaultStringifier<Map<String, Double>>(conf,
            GenericsUtil.getClass(labelDocumentCounts));

    String labelDocumentCountString = mapStringifier.toString(labelDocumentCounts);
    log.info("Counts of documents in Each Label");
    Map<String, Double> c = mapStringifier.fromString(labelDocumentCountString);
    log.info("{}", c);

    conf.set("cnaivebayes.labelDocumentCounts", labelDocumentCountString);
    log.info(params.print());
    if (params.get("dataSource").equals("hbase")) {
        String tableName = output.toString();
        HBaseConfiguration hc = new HBaseConfiguration(new Configuration());
        HTableDescriptor ht = new HTableDescriptor(tableName);
        HColumnDescriptor hcd = new HColumnDescriptor(BayesConstants.HBASE_COLUMN_FAMILY + ':');
        hcd.setBloomfilter(true);
        hcd.setInMemory(true);
        hcd.setMaxVersions(1);
        hcd.setBlockCacheEnabled(true);
        ht.addFamily(hcd);

        log.info("Connecting to hbase...");
        HBaseAdmin hba = new HBaseAdmin(hc);
        log.info("Creating Table {}", output);

        if (hba.tableExists(tableName)) {
            hba.disableTable(tableName);
            hba.deleteTable(tableName);
            hba.majorCompact(".META.");
        }
        hba.createTable(ht);
        conf.set("output.table", tableName);
    }
    conf.set("bayes.parameters", params.toString());

    client.setConf(conf);

    JobClient.runJob(conf);
}

From source file: org.apache.mahout.classifier.bayes.mapreduce.common.BayesTfIdfMapper.java

License: Apache License

@Override
public void configure(JobConf job) {
    try {
        this.labelDocumentCounts.clear();
        Map<String, Double> labelDocCountTemp = new HashMap<String, Double>();

        DefaultStringifier<Map<String, Double>> mapStringifier = new DefaultStringifier<Map<String, Double>>(
                job, GenericsUtil.getClass(labelDocCountTemp));

        String labelDocumentCountString = job.get("cnaivebayes.labelDocumentCounts",
                mapStringifier.toString(labelDocCountTemp));

        labelDocCountTemp = mapStringifier.fromString(labelDocumentCountString);
        for (Map.Entry<String, Double> stringDoubleEntry : labelDocCountTemp.entrySet()) {
            this.labelDocumentCounts.put(stringDoubleEntry.getKey(), stringDoubleEntry.getValue());
        }

    } catch (IOException ex) {
        log.warn(ex.toString(), ex);
    }
}

From source file: org.apache.mahout.classifier.bayes.WikipediaDatasetCreatorDriver.java

License: Apache License

/**
 * Run the job
 * 
 * @param input
 *          the input pathname String
 * @param output
 *          the output pathname String
 * @param catFile
 *          the file containing the Wikipedia categories
 * @param exactMatchOnly
 *          if true, then the Wikipedia category must match exactly instead of simply containing the
 *          category string
 * @throws ClassNotFoundException 
 * @throws InterruptedException 
 */
public static void runJob(String input, String output, String catFile, boolean exactMatchOnly,
        Class<? extends Analyzer> analyzerClass)
        throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration();
    conf.set("key.value.separator.in.input.line", " ");
    conf.set("xmlinput.start", "<text xml:space=\"preserve\">");
    conf.set("xmlinput.end", "</text>");
    conf.setBoolean("exact.match.only", exactMatchOnly);
    conf.set("analyzer.class", analyzerClass.getName());
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    // Don't ever forget this. People should keep track of how Hadoop conf
    // parameters can make or break a piece of code.

    Set<String> categories = new HashSet<String>();
    for (String line : new FileLineIterable(new File(catFile))) {
        categories.add(line.trim().toLowerCase(Locale.ENGLISH));
    }

    DefaultStringifier<Set<String>> setStringifier = new DefaultStringifier<Set<String>>(conf,
            GenericsUtil.getClass(categories));

    String categoriesStr = setStringifier.toString(categories);

    conf.set("wikipedia.categories", categoriesStr);

    Job job = new Job(conf);
    if (log.isInfoEnabled()) {
        log.info("Input: {} Out: {} Categories: {}", new Object[] { input, output, catFile });
    }
    job.setJarByClass(WikipediaDatasetCreatorDriver.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setMapperClass(WikipediaDatasetCreatorMapper.class);
    //TODO: job.setNumMapTasks(100);
    job.setInputFormatClass(XmlInputFormat.class);
    job.setReducerClass(WikipediaDatasetCreatorReducer.class);
    job.setOutputFormatClass(WikipediaDatasetCreatorOutputFormat.class);

    FileInputFormat.setInputPaths(job, new Path(input));
    Path outPath = new Path(output);
    FileOutputFormat.setOutputPath(job, outPath);
    HadoopUtil.overwriteOutput(outPath);

    job.waitForCompletion(true);
}

From source file: org.apache.mahout.classifier.bayes.WikipediaDatasetCreatorMapper.java

License: Apache License

@Override
protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    Configuration conf = context.getConfiguration();
    try {
        if (inputCategories == null) {
            Set<String> newCategories = new HashSet<String>();

            DefaultStringifier<Set<String>> setStringifier = new DefaultStringifier<Set<String>>(conf,
                    GenericsUtil.getClass(newCategories));

            String categoriesStr = setStringifier.toString(newCategories);
            categoriesStr = conf.get("wikipedia.categories", categoriesStr);
            inputCategories = setStringifier.fromString(categoriesStr);

        }
        exactMatchOnly = conf.getBoolean("exact.match.only", false);
        if (analyzer == null) {
            String analyzerStr = conf.get("analyzer.class", WikipediaAnalyzer.class.getName());
            Class<? extends Analyzer> analyzerClass = Class.forName(analyzerStr).asSubclass(Analyzer.class);
            analyzer = analyzerClass.newInstance();
        }
    } catch (IOException ex) {
        throw new IllegalStateException(ex);
    } catch (ClassNotFoundException e) {
        throw new IllegalStateException(e);
    } catch (IllegalAccessException e) {
        throw new IllegalStateException(e);
    } catch (InstantiationException e) {
        throw new IllegalStateException(e);
    }
    log.info("Configure: Input Categories size: {} Exact Match: {} Analyzer: {}",
            new Object[] { inputCategories.size(), exactMatchOnly, analyzer.getClass().getName() });
}

From source file: org.apache.mahout.classifier.cbayes.CBayesNormalizedWeightDriver.java

License: Apache License

/**
 * Run the job
 *
 * @param input  the input pathname String
 * @param output the output pathname String
 */
public static void runJob(String input, String output) throws IOException {
    JobClient client = new JobClient();
    JobConf conf = new JobConf(CBayesNormalizedWeightDriver.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(DoubleWritable.class);
    FileInputFormat.addInputPath(conf, new Path(output + "/trainer-theta"));
    Path outPath = new Path(output + "/trainer-weight");
    FileOutputFormat.setOutputPath(conf, outPath);
    conf.setNumMapTasks(100);
    //conf.setNumReduceTasks(1);
    conf.setMapperClass(CBayesNormalizedWeightMapper.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setCombinerClass(CBayesNormalizedWeightReducer.class);
    conf.setReducerClass(CBayesNormalizedWeightReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    conf.set("io.serializations",
            "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");
    // Don't ever forget this. People should keep track of how Hadoop conf parameters can make or break a piece of code.
    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    if (dfs.exists(outPath)) {
        dfs.delete(outPath, true);
    }

    Path thetaNormalizationsFiles = new Path(output + "/trainer-thetaNormalizer/part*");
    Map<String, Double> thetaNormalizer = SequenceFileModelReader.readLabelSums(dfs, thetaNormalizationsFiles,
            conf);
    double perLabelWeightSumNormalisationFactor = Double.MAX_VALUE;
    for (Map.Entry<String, Double> stringDoubleEntry1 : thetaNormalizer.entrySet()) {

        double Sigma_W_ij = stringDoubleEntry1.getValue();
        if (perLabelWeightSumNormalisationFactor > Math.abs(Sigma_W_ij)) {
            perLabelWeightSumNormalisationFactor = Math.abs(Sigma_W_ij);
        }
    }

    for (Map.Entry<String, Double> stringDoubleEntry : thetaNormalizer.entrySet()) {
        double Sigma_W_ij = stringDoubleEntry.getValue();
        thetaNormalizer.put(stringDoubleEntry.getKey(), Sigma_W_ij / perLabelWeightSumNormalisationFactor);
    }

    DefaultStringifier<Map<String, Double>> mapStringifier = new DefaultStringifier<Map<String, Double>>(conf,
            GenericsUtil.getClass(thetaNormalizer));
    String thetaNormalizationsString = mapStringifier.toString(thetaNormalizer);

    Map<String, Double> c = mapStringifier.fromString(thetaNormalizationsString);
    log.info("{}", c);
    conf.set("cnaivebayes.thetaNormalizations", thetaNormalizationsString);

    client.setConf(conf);

    JobClient.runJob(conf);

}