Usage examples for org.apache.hadoop.io.DefaultStringifier.fromString
@Override public T fromString(String str) throws IOException
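fromString(String) decodes a value that was previously encoded into a Configuration string with toString(T). Before the examples, here is a minimal round-trip sketch; the key name "example.count" and the standalone main class are illustrative assumptions, not taken from the sources below.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.DefaultStringifier;

// Minimal sketch, assuming JavaSerialization is acceptable for the value type.
public class StringifierRoundTrip {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // DefaultStringifier needs a registered serialization for the value class;
        // Integer is java.io.Serializable, so JavaSerialization covers it.
        conf.set("io.serializations",
            "org.apache.hadoop.io.serializer.JavaSerialization,"
                + "org.apache.hadoop.io.serializer.WritableSerialization");

        DefaultStringifier<Integer> stringifier =
            new DefaultStringifier<Integer>(conf, Integer.class);

        // Encode the value to a Base64 string and park it in the configuration.
        conf.set("example.count", stringifier.toString(42)); // "example.count" is a made-up key

        // Later (typically on the task side), decode it back.
        Integer decoded = stringifier.fromString(conf.get("example.count"));
        System.out.println(decoded); // prints 42
    }
}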
From source file:co.nubetech.apache.hadoop.mapred.DBQueryInputFormat.java
License:Apache License
@Override
protected RecordReader<LongWritable, GenericDBWritable> createDBRecordReader(
        org.apache.hadoop.mapreduce.lib.db.DBInputFormat.DBInputSplit split, Configuration conf)
        throws IOException {
    org.apache.hadoop.mapreduce.lib.db.DBConfiguration dbConf = getDBConf();
    @SuppressWarnings("unchecked")
    // Class<T> inputClass = (Class<T>) (dbConf.getInputClass());
    String dbProductName = getDBProductName();
    logger.debug("Creating db record reader for db product: " + dbProductName);
    ArrayList params = null;
    try {
        if (conf.get(HIHOConf.QUERY_PARAMS) != null) {
            logger.debug("creating stringifier in DBQueryInputFormat");
            DefaultStringifier<ArrayList> stringifier =
                new DefaultStringifier<ArrayList>(conf, ArrayList.class);
            logger.debug("created stringifier");
            params = stringifier.fromString(conf.get(HIHOConf.QUERY_PARAMS));
            logger.debug("created params");
        }
        // use database product name to determine appropriate record reader.
        if (dbProductName.startsWith("MYSQL")) {
            // use MySQL-specific db reader.
            return new MySQLQueryRecordReader(split, conf, getConnection(), dbConf,
                dbConf.getInputConditions(), dbConf.getInputFieldNames(),
                dbConf.getInputTableName(), params);
        } else {
            // Generic reader.
            return new DBQueryRecordReader(split, conf, getConnection(), dbConf,
                dbConf.getInputConditions(), dbConf.getInputFieldNames(),
                dbConf.getInputTableName(), dbProductName, params);
        }
    } catch (SQLException ex) {
        throw new IOException(ex.getMessage());
    }
}
From source file:co.nubetech.hiho.mapreduce.lib.db.DBQueryInputFormat.java
License:Apache License
@Override
protected RecordReader<LongWritable, GenericDBWritable> createDBRecordReader(DBInputSplit split,
        Configuration conf) throws IOException {
    DBConfiguration dbConf = getDBConf();
    @SuppressWarnings("unchecked")
    // Class<T> inputClass = (Class<T>) (dbConf.getInputClass());
    String dbProductName = getDBProductName();
    logger.debug("Creating db record reader for db product: " + dbProductName);
    ArrayList params = null;
    try {
        if (conf.get(HIHOConf.QUERY_PARAMS) != null) {
            logger.debug("creating stringifier in DBQueryInputFormat");
            DefaultStringifier<ArrayList> stringifier =
                new DefaultStringifier<ArrayList>(conf, ArrayList.class);
            logger.debug("created stringifier");
            params = stringifier.fromString(conf.get(HIHOConf.QUERY_PARAMS));
            logger.debug("created params");
        }
        // use database product name to determine appropriate record reader.
        if (dbProductName.startsWith("MYSQL")) {
            // use MySQL-specific db reader.
            return new MySQLQueryRecordReader(split, conf, getConnection(), dbConf,
                dbConf.getInputConditions(), dbConf.getInputFieldNames(),
                dbConf.getInputTableName(), params);
        } else {
            // Generic reader.
            return new DBQueryRecordReader(split, conf, getConnection(), dbConf,
                dbConf.getInputConditions(), dbConf.getInputFieldNames(),
                dbConf.getInputTableName(), dbProductName, params);
        }
    } catch (SQLException ex) {
        throw new IOException(ex.getMessage());
    }
}
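Both readers above expect HIHOConf.QUERY_PARAMS to already hold a DefaultStringifier-encoded ArrayList. The job-setup side is not part of these sources, so the following is only a sketch of how such a value could be written; the parameter value is invented.

// Sketch of the producing side (assumed, not from the HIHO sources):
// encode query parameters into the job configuration before submission.
Configuration conf = new Configuration();
conf.set("io.serializations",
    "org.apache.hadoop.io.serializer.JavaSerialization,"
        + "org.apache.hadoop.io.serializer.WritableSerialization");
ArrayList params = new ArrayList();
params.add("2012-01-01"); // invented query parameter
DefaultStringifier<ArrayList> stringifier = new DefaultStringifier<ArrayList>(conf, ArrayList.class);
conf.set(HIHOConf.QUERY_PARAMS, stringifier.toString(params));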
From source file:org.apache.mahout.avro.text.mapred.WikipediaAvroDocumentMapper.java
License:Apache License
@Override
public void configure(JobConf job) {
    try {
        if (inputCategories == null) {
            Set<String> newCategories = new HashSet<String>();
            DefaultStringifier<Set<String>> setStringifier =
                new DefaultStringifier<Set<String>>(job, GenericsUtil.getClass(newCategories));
            String categoriesStr = setStringifier.toString(newCategories);
            categoriesStr = job.get("wikipedia.categories", categoriesStr);
            inputCategories = setStringifier.fromString(categoriesStr);
        }
        exactMatchOnly = job.getBoolean("exact.match.only", false);
        all = job.getBoolean("all.files", true);
    } catch (IOException ex) {
        throw new IllegalStateException(ex);
    }
    log.info("Configure: Input Categories size: " + inputCategories.size() + " All: " + all
        + " Exact Match: " + exactMatchOnly);
}
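Note the idiom in this configure() method, repeated in the mapper examples below: an empty default is stringified first, so that job.get(key, default) always returns a parsable encoded value and fromString() never sees null or raw text. Reduced to its core (with a made-up key name, for illustration only):

// Core of the default-value idiom ("example.categories" is a made-up key).
Set<String> defaults = new HashSet<String>();
DefaultStringifier<Set<String>> setStringifier =
    new DefaultStringifier<Set<String>>(job, GenericsUtil.getClass(defaults));
String encoded = job.get("example.categories", setStringifier.toString(defaults));
Set<String> categories = setStringifier.fromString(encoded);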
From source file:org.apache.mahout.classifier.bayes.BayesThetaNormalizerDriver.java
License:Apache License
/**
 * Run the job
 *
 * @param input the input pathname String
 * @param output the output pathname String
 */
public static void runJob(String input, String output) throws IOException {
    JobClient client = new JobClient();
    JobConf conf = new JobConf(BayesThetaNormalizerDriver.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(DoubleWritable.class);
    FileInputFormat.addInputPath(conf, new Path(output + "/trainer-tfIdf/trainer-tfIdf"));
    Path outPath = new Path(output + "/trainer-thetaNormalizer");
    FileOutputFormat.setOutputPath(conf, outPath);
    conf.setNumMapTasks(100);
    //conf.setNumReduceTasks(1);
    conf.setMapperClass(BayesThetaNormalizerMapper.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setCombinerClass(BayesThetaNormalizerReducer.class);
    conf.setReducerClass(BayesThetaNormalizerReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    // Don't ever forget this. People should keep track of how Hadoop conf parameters
    // can make or break a piece of code.
    conf.set("io.serializations",
        "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");

    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    if (dfs.exists(outPath)) {
        dfs.delete(outPath, true);
    }

    Path Sigma_kFiles = new Path(output + "/trainer-weights/Sigma_k/*");
    Map<String, Double> labelWeightSum = SequenceFileModelReader.readLabelSums(dfs, Sigma_kFiles, conf);
    DefaultStringifier<Map<String, Double>> mapStringifier =
        new DefaultStringifier<Map<String, Double>>(conf, GenericsUtil.getClass(labelWeightSum));
    String labelWeightSumString = mapStringifier.toString(labelWeightSum);

    log.info("Sigma_k for Each Label");
    Map<String, Double> c = mapStringifier.fromString(labelWeightSumString);
    log.info("{}", c);
    conf.set("cnaivebayes.sigma_k", labelWeightSumString);

    Path sigma_kSigma_jFile = new Path(output + "/trainer-weights/Sigma_kSigma_j/*");
    double sigma_jSigma_k = SequenceFileModelReader.readSigma_jSigma_k(dfs, sigma_kSigma_jFile, conf);
    DefaultStringifier<Double> stringifier = new DefaultStringifier<Double>(conf, Double.class);
    String sigma_jSigma_kString = stringifier.toString(sigma_jSigma_k);

    log.info("Sigma_kSigma_j for each Label and for each Features");
    double retSigma_jSigma_k = stringifier.fromString(sigma_jSigma_kString);
    log.info("{}", retSigma_jSigma_k);
    conf.set("cnaivebayes.sigma_jSigma_k", sigma_jSigma_kString);

    Path vocabCountFile = new Path(output + "/trainer-tfIdf/trainer-vocabCount/*");
    double vocabCount = SequenceFileModelReader.readVocabCount(dfs, vocabCountFile, conf);
    String vocabCountString = stringifier.toString(vocabCount);

    log.info("Vocabulary Count");
    conf.set("cnaivebayes.vocabCount", vocabCountString);
    double retvocabCount = stringifier.fromString(vocabCountString);
    log.info("{}", retvocabCount);

    client.setConf(conf);
    JobClient.runJob(conf);
}
From source file:org.apache.mahout.classifier.bayes.BayesThetaNormalizerMapper.java
License:Apache License
@Override
public void configure(JobConf job) {
    try {
        if (labelWeightSum == null) {
            labelWeightSum = new HashMap<String, Double>();
            DefaultStringifier<Map<String, Double>> mapStringifier =
                new DefaultStringifier<Map<String, Double>>(job, GenericsUtil.getClass(labelWeightSum));
            String labelWeightSumString = mapStringifier.toString(labelWeightSum);
            labelWeightSumString = job.get("cnaivebayes.sigma_k", labelWeightSumString);
            labelWeightSum = mapStringifier.fromString(labelWeightSumString);

            DefaultStringifier<Double> stringifier =
                new DefaultStringifier<Double>(job, GenericsUtil.getClass(sigma_jSigma_k));
            String sigma_jSigma_kString = stringifier.toString(sigma_jSigma_k);
            sigma_jSigma_kString = job.get("cnaivebayes.sigma_jSigma_k", sigma_jSigma_kString);
            sigma_jSigma_k = stringifier.fromString(sigma_jSigma_kString);

            String vocabCountString = stringifier.toString(vocabCount);
            vocabCountString = job.get("cnaivebayes.vocabCount", vocabCountString);
            vocabCount = stringifier.fromString(vocabCountString);
        }
    } catch (IOException ex) {
        log.warn(ex.toString(), ex);
    }
}
From source file:org.apache.mahout.classifier.bayes.common.BayesFeatureDriver.java
License:Apache License
/**
 * Run the job
 *
 * @param input the input pathname String
 * @param output the output pathname String
 * @param gramSize the n-gram size
 */
public static void runJob(String input, String output, int gramSize) throws IOException {
    JobClient client = new JobClient();
    JobConf conf = new JobConf(BayesFeatureDriver.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(DoubleWritable.class);
    FileInputFormat.setInputPaths(conf, new Path(input));
    Path outPath = new Path(output);
    FileOutputFormat.setOutputPath(conf, outPath);
    conf.setNumMapTasks(100);
    //conf.setNumReduceTasks(1);
    conf.setMapperClass(BayesFeatureMapper.class);
    conf.setInputFormat(KeyValueTextInputFormat.class);
    conf.setCombinerClass(BayesFeatureReducer.class);
    conf.setReducerClass(BayesFeatureReducer.class);
    conf.setOutputFormat(BayesFeatureOutputFormat.class);
    // Don't ever forget this. People should keep track of how Hadoop conf parameters
    // can make or break a piece of code.
    conf.set("io.serializations",
        "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");

    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    if (dfs.exists(outPath)) {
        dfs.delete(outPath, true);
    }

    DefaultStringifier<Integer> intStringifier = new DefaultStringifier<Integer>(conf, Integer.class);
    String gramSizeString = intStringifier.toString(gramSize);
    log.info("{}", intStringifier.fromString(gramSizeString));
    conf.set("bayes.gramSize", gramSizeString);

    client.setConf(conf);
    JobClient.runJob(conf);
}
From source file:org.apache.mahout.classifier.bayes.common.BayesFeatureMapper.java
License:Apache License
@Override
public void configure(JobConf job) {
    try {
        DefaultStringifier<Integer> intStringifier = new DefaultStringifier<Integer>(job, Integer.class);
        String gramSizeString = intStringifier.toString(gramSize);
        gramSizeString = job.get("bayes.gramSize", gramSizeString);
        gramSize = intStringifier.fromString(gramSizeString);
    } catch (IOException ex) {
        log.warn(ex.toString(), ex);
    }
}
From source file:org.apache.mahout.classifier.bayes.common.BayesTfIdfDriver.java
License:Apache License
/**
 * Run the job
 *
 * @param input the input pathname String
 * @param output the output pathname String
 */
public static void runJob(String input, String output) throws IOException {
    JobClient client = new JobClient();
    JobConf conf = new JobConf(BayesTfIdfDriver.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(DoubleWritable.class);
    FileInputFormat.addInputPath(conf, new Path(output + "/trainer-termDocCount"));
    FileInputFormat.addInputPath(conf, new Path(output + "/trainer-wordFreq"));
    FileInputFormat.addInputPath(conf, new Path(output + "/trainer-featureCount"));
    Path outPath = new Path(output + "/trainer-tfIdf");
    FileOutputFormat.setOutputPath(conf, outPath);
    conf.setNumMapTasks(100);
    conf.setMapperClass(BayesTfIdfMapper.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setCombinerClass(BayesTfIdfReducer.class);
    conf.setReducerClass(BayesTfIdfReducer.class);
    conf.setOutputFormat(BayesTfIdfOutputFormat.class);
    // Don't ever forget this. People should keep track of how Hadoop conf parameters
    // can make or break a piece of code.
    conf.set("io.serializations",
        "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");

    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    if (dfs.exists(outPath)) {
        dfs.delete(outPath, true);
    }

    Path interimFile = new Path(output + "/trainer-docCount/part-*");
    Map<String, Double> labelDocumentCounts = SequenceFileModelReader.readLabelDocumentCounts(dfs, interimFile, conf);
    DefaultStringifier<Map<String, Double>> mapStringifier =
        new DefaultStringifier<Map<String, Double>>(conf, GenericsUtil.getClass(labelDocumentCounts));
    String labelDocumentCountString = mapStringifier.toString(labelDocumentCounts);

    log.info("Counts of documents in Each Label");
    Map<String, Double> c = mapStringifier.fromString(labelDocumentCountString);
    log.info("{}", c);
    conf.set("cnaivebayes.labelDocumentCounts", labelDocumentCountString);

    client.setConf(conf);
    JobClient.runJob(conf);
}
From source file:org.apache.mahout.classifier.bayes.common.BayesTfIdfMapper.java
License:Apache License
@Override
public void configure(JobConf job) {
    try {
        if (labelDocumentCounts == null) {
            labelDocumentCounts = new HashMap<String, Double>();
            DefaultStringifier<Map<String, Double>> mapStringifier =
                new DefaultStringifier<Map<String, Double>>(job, GenericsUtil.getClass(labelDocumentCounts));
            String labelDocumentCountString = mapStringifier.toString(labelDocumentCounts);
            labelDocumentCountString = job.get("cnaivebayes.labelDocumentCounts", labelDocumentCountString);
            labelDocumentCounts = mapStringifier.fromString(labelDocumentCountString);
        }
    } catch (IOException ex) {
        log.warn(ex.toString(), ex);
    }
}
From source file:org.apache.mahout.classifier.bayes.mapreduce.bayes.BayesThetaNormalizerDriver.java
License:Apache License
@Override
public void runJob(Path input, Path output, BayesParameters params) throws IOException {
    Configurable client = new JobClient();
    JobConf conf = new JobConf(BayesThetaNormalizerDriver.class);
    conf.setJobName("Bayes Theta Normalizer Driver running over input: " + input);

    conf.setOutputKeyClass(StringTuple.class);
    conf.setOutputValueClass(DoubleWritable.class);
    FileInputFormat.addInputPath(conf, new Path(output, "trainer-tfIdf/trainer-tfIdf"));
    Path outPath = new Path(output, "trainer-thetaNormalizer");
    FileOutputFormat.setOutputPath(conf, outPath);
    // conf.setNumMapTasks(100);
    // conf.setNumReduceTasks(1);
    conf.setMapperClass(BayesThetaNormalizerMapper.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setCombinerClass(BayesThetaNormalizerReducer.class);
    conf.setReducerClass(BayesThetaNormalizerReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    // Don't ever forget this. People should keep track of how Hadoop conf parameters
    // can make or break a piece of code.
    conf.set("io.serializations",
        "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");

    HadoopUtil.overwriteOutput(outPath);
    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);

    Path sigmaKFiles = new Path(output, "trainer-weights/Sigma_k/*");
    Map<String, Double> labelWeightSum = SequenceFileModelReader.readLabelSums(dfs, sigmaKFiles, conf);
    DefaultStringifier<Map<String, Double>> mapStringifier =
        new DefaultStringifier<Map<String, Double>>(conf, GenericsUtil.getClass(labelWeightSum));
    String labelWeightSumString = mapStringifier.toString(labelWeightSum);

    log.info("Sigma_k for Each Label");
    Map<String, Double> c = mapStringifier.fromString(labelWeightSumString);
    log.info("{}", c);
    conf.set("cnaivebayes.sigma_k", labelWeightSumString);

    Path sigmaJSigmaKFile = new Path(output, "trainer-weights/Sigma_kSigma_j/*");
    double sigmaJSigmaK = SequenceFileModelReader.readSigmaJSigmaK(dfs, sigmaJSigmaKFile, conf);
    DefaultStringifier<Double> stringifier = new DefaultStringifier<Double>(conf, Double.class);
    String sigmaJSigmaKString = stringifier.toString(sigmaJSigmaK);

    log.info("Sigma_kSigma_j for each Label and for each Features");
    double retSigmaJSigmaK = stringifier.fromString(sigmaJSigmaKString);
    log.info("{}", retSigmaJSigmaK);
    conf.set("cnaivebayes.sigma_jSigma_k", sigmaJSigmaKString);

    Path vocabCountFile = new Path(output, "trainer-tfIdf/trainer-vocabCount/*");
    double vocabCount = SequenceFileModelReader.readVocabCount(dfs, vocabCountFile, conf);
    String vocabCountString = stringifier.toString(vocabCount);

    log.info("Vocabulary Count");
    conf.set("cnaivebayes.vocabCount", vocabCountString);
    double retvocabCount = stringifier.fromString(vocabCountString);
    log.info("{}", retvocabCount);

    conf.set("bayes.parameters", params.toString());
    conf.set("output.table", output.toString());

    client.setConf(conf);
    JobClient.runJob(conf);
}