Example usage for the org.apache.hadoop.io.DefaultStringifier constructor

Introduction

This page collects example usages of the org.apache.hadoop.io.DefaultStringifier constructor, drawn from open-source projects.

Prototype

public DefaultStringifier(Configuration conf, Class<T> c) 
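
The constructor looks up a serializer for c among the classes registered under io.serializations in conf, so the type must be covered by one of those serializations (Writable types work out of the box). A minimal round-trip sketch; the class name StringifierRoundTrip is made up:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.DefaultStringifier;
import org.apache.hadoop.io.Text;

public class StringifierRoundTrip {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // Text is a Writable, so the default WritableSerialization applies.
        DefaultStringifier<Text> stringifier = new DefaultStringifier<Text>(conf, Text.class);
        String encoded = stringifier.toString(new Text("hello")); // serialized bytes, Base64-encoded
        Text decoded = stringifier.fromString(encoded);
        System.out.println(decoded); // hello
        stringifier.close();
    }
}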

Usage

From source file:co.nubetech.apache.hadoop.mapred.DBQueryInputFormat.java

License:Apache License

@Override
protected RecordReader<LongWritable, GenericDBWritable> createDBRecordReader(
        org.apache.hadoop.mapreduce.lib.db.DBInputFormat.DBInputSplit split, Configuration conf)
        throws IOException {

    org.apache.hadoop.mapreduce.lib.db.DBConfiguration dbConf = getDBConf();
    @SuppressWarnings("unchecked")
    // Class<T> inputClass = (Class<T>) (dbConf.getInputClass());
    String dbProductName = getDBProductName();

    logger.debug("Creating db record reader for db product: " + dbProductName);
    ArrayList params = null;
    try {
        if (conf.get(HIHOConf.QUERY_PARAMS) != null) {
            logger.debug("creating stringifier in DBQueryInputFormat");
            DefaultStringifier<ArrayList> stringifier = new DefaultStringifier<ArrayList>(conf,
                    ArrayList.class);
            logger.debug("created stringifier");

            params = stringifier.fromString(conf.get(HIHOConf.QUERY_PARAMS));
            logger.debug("created params");
        }
        // use database product name to determine appropriate record reader.
        if (dbProductName.startsWith("MYSQL")) {
            // use MySQL-specific db reader.
            return new MySQLQueryRecordReader(split, conf, getConnection(), dbConf, dbConf.getInputConditions(),
                    dbConf.getInputFieldNames(), dbConf.getInputTableName(), params);
        } else {
            // Generic reader.
            return new DBQueryRecordReader(split, conf, getConnection(), dbConf, dbConf.getInputConditions(),
                    dbConf.getInputFieldNames(), dbConf.getInputTableName(), dbProductName, params);
        }
    } catch (SQLException ex) {
        throw new IOException(ex.getMessage());
    }
}

From source file:co.nubetech.apache.hadoop.mapred.DBQueryInputFormat.java

License:Apache License

/**
 * Note that the "orderBy" column is called the "splitBy" in this version.
 * We reuse the same field, but it's not strictly ordering it -- just
 * partitioning the results.
 */
public static void setInput(Job job, String tableName, String conditions, String splitBy, ArrayList params,
        String... fieldNames) throws IOException {
    DBInputFormat.setInput(job, GenericDBWritable.class, tableName, conditions, splitBy, fieldNames);
    if (params != null) {
        DefaultStringifier<ArrayList> stringifier = new DefaultStringifier<ArrayList>(job.getConfiguration(),
                ArrayList.class);
        job.getConfiguration().set(HIHOConf.QUERY_PARAMS, stringifier.toString(params));
        logger.debug("Converted params and saved them into config");
    }
    job.setInputFormatClass(DBQueryInputFormat.class);
}
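
A hypothetical driver-side call for the overload above; the table, condition, and column names are invented, and the single param is presumably bound to the ? placeholder when the query runs:

    Job job = new Job(new Configuration(), "db-query-import");
    ArrayList params = new ArrayList();
    params.add("ENGINEERING");
    DBQueryInputFormat.setInput(job, "employees", "dept = ?", "id", params, "id", "name", "salary");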

From source file:co.nubetech.apache.hadoop.mapred.DBQueryInputFormat.java

License:Apache License

/**
 * setInput() takes a custom query and a separate "bounding query" to use
 * instead of the custom "count query" used by DBInputFormat.
 */
public static void setInput(JobConf job, String inputQuery, String inputBoundingQuery, ArrayList params)
        throws IOException {
    DBInputFormat.setInput(job, GenericDBWritable.class, inputQuery, "");

    if (inputBoundingQuery != null) {
        job.set(DBConfiguration.INPUT_BOUNDING_QUERY, inputBoundingQuery);
    }
    if (params != null) {
        DefaultStringifier<ArrayList> stringifier = new DefaultStringifier<ArrayList>(job, ArrayList.class);
        job.set(HIHOConf.QUERY_PARAMS, stringifier.toString(params));
        logger.debug("Converted params and saved them into config");
    }
    job.setInputFormat(DBQueryInputFormat.class);
}
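
A sketch of driving this old-API (mapred) overload; the SQL is illustrative. The bounding query stands in for the count query DBInputFormat would otherwise issue when computing splits:

    JobConf job = new JobConf(DBQueryInputFormat.class);
    ArrayList params = new ArrayList();
    params.add(1000);
    DBQueryInputFormat.setInput(job,
            "SELECT id, name FROM employees WHERE salary > ?",
            "SELECT MIN(id), MAX(id) FROM employees", params);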

From source file:co.nubetech.hiho.mapreduce.lib.db.DBQueryInputFormat.java

License:Apache License

@Override
protected RecordReader<LongWritable, GenericDBWritable> createDBRecordReader(DBInputSplit split,
        Configuration conf) throws IOException {

    DBConfiguration dbConf = getDBConf();
    @SuppressWarnings("unchecked")
    // Class<T> inputClass = (Class<T>) (dbConf.getInputClass());
    String dbProductName = getDBProductName();

    logger.debug("Creating db record reader for db product: " + dbProductName);
    ArrayList params = null;
    try {
        if (conf.get(HIHOConf.QUERY_PARAMS) != null) {
            logger.debug("creating stringifier in DBQueryInputFormat");
            DefaultStringifier<ArrayList> stringifier = new DefaultStringifier<ArrayList>(conf,
                    ArrayList.class);
            logger.debug("created stringifier");

            params = stringifier.fromString(conf.get(HIHOConf.QUERY_PARAMS));
            logger.debug("created params");
        }
        // use database product name to determine appropriate record reader.
        if (dbProductName.startsWith("MYSQL")) {
            // use MySQL-specific db reader.
            return new MySQLQueryRecordReader(split, conf, getConnection(), dbConf, dbConf.getInputConditions(),
                    dbConf.getInputFieldNames(), dbConf.getInputTableName(), params);
        } else {
            // Generic reader.
            return new DBQueryRecordReader(split, conf, getConnection(), dbConf, dbConf.getInputConditions(),
                    dbConf.getInputFieldNames(), dbConf.getInputTableName(), dbProductName, params);
        }
    } catch (SQLException ex) {
        throw new IOException(ex.getMessage());
    }
}

From source file:co.nubetech.hiho.mapreduce.lib.db.DBQueryInputFormat.java

License:Apache License

/**
 * setInput() takes a custom query and a separate "bounding query" to use
 * instead of the custom "count query" used by DBInputFormat.
 */
public static void setInput(Job job, String inputQuery, String inputBoundingQuery, ArrayList params)
        throws IOException {
    DBInputFormat.setInput(job, GenericDBWritable.class, inputQuery, "");
    if (inputBoundingQuery != null) {
        job.getConfiguration().set(DBConfiguration.INPUT_BOUNDING_QUERY, inputBoundingQuery);
    }
    if (params != null) {
        DefaultStringifier<ArrayList> stringifier = new DefaultStringifier<ArrayList>(job.getConfiguration(),
                ArrayList.class);
        job.getConfiguration().set(HIHOConf.QUERY_PARAMS, stringifier.toString(params));
        logger.debug("Converted params and saved them into config");
    }
    job.setInputFormatClass(DBQueryInputFormat.class);
}

From source file:com.ikanow.aleph2.analytics.hadoop.assets.Aleph2MultiInputFormatBuilder.java

License:Apache License

/** Sets the output configurations in the job 
 * @param job
 */
public Job build(final Job job) {

    job.getConfiguration().set(ALEPH2_MULTI_INPUT_FORMAT_JOBS,
            _inputs.keySet().stream().collect(Collectors.joining(",")));
    _inputs.entrySet().stream().forEach(Lambdas.wrap_consumer_u(kv -> {
        try (final Stringifier<Configuration> stringifier = new DefaultStringifier<Configuration>(
                job.getConfiguration(), Configuration.class)) {
            final Configuration new_config = new Configuration(kv.getValue().getConfiguration());
            new_config.set(ALEPH2_MULTI_INPUT_FORMAT_CLAZZ, kv.getValue().getInputFormatClass().getName());
            job.getConfiguration().set(ALEPH2_MULTI_INPUT_FORMAT_PREFIX + kv.getKey(),
                    stringifier.toString(new_config));
        }
    }));
    job.setInputFormatClass(Aleph2MultiInputFormat.class);
    return job;
}
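
org.apache.hadoop.conf.Configuration implements Writable, which is why a whole Configuration can pass through DefaultStringifier here. A minimal sketch of the matching read side, assuming the same ALEPH2_* keys plus a task-side context and an inputId taken from the job list (both hypothetical; the real recovery logic lives in Aleph2MultiInputFormat):

    String encoded = context.getConfiguration().get(ALEPH2_MULTI_INPUT_FORMAT_PREFIX + inputId);
    try (Stringifier<Configuration> stringifier = new DefaultStringifier<Configuration>(
            context.getConfiguration(), Configuration.class)) {
        Configuration perInput = stringifier.fromString(encoded);
        String inputFormatClazz = perInput.get(ALEPH2_MULTI_INPUT_FORMAT_CLAZZ);
        // ... instantiate the delegate input format from inputFormatClazz ...
    }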

From source file:org.apache.mahout.avro.text.mapred.WikipediaAvroDocumentMapper.java

License:Apache License

@Override
public void configure(JobConf job) {
    try {
        if (inputCategories == null) {
            Set<String> newCategories = new HashSet<String>();

            DefaultStringifier<Set<String>> setStringifier = new DefaultStringifier<Set<String>>(job,
                    GenericsUtil.getClass(newCategories));

            String categoriesStr = setStringifier.toString(newCategories);
            categoriesStr = job.get("wikipedia.categories", categoriesStr);
            inputCategories = setStringifier.fromString(categoriesStr);

        }
        exactMatchOnly = job.getBoolean("exact.match.only", false);
        all = job.getBoolean("all.files", true);
    } catch (IOException ex) {
        throw new IllegalStateException(ex);
    }
    log.info("Configure: Input Categories size: " + inputCategories.size() + " All: " + all + " Exact Match: "
            + exactMatchOnly);
}
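
The configure() above follows a common idiom: stringify an empty default, let job.get(key, default) fall back to it if the key was never set, then parse whichever string won. DefaultStringifier's static store/load helpers compress the same round trip; a sketch, assuming JavaSerialization has been registered in io.serializations (HashSet is not a Writable):

    Set<String> categories = new HashSet<String>();
    categories.add("science");
    DefaultStringifier.store(conf, categories, "wikipedia.categories"); // driver side
    Set<String> restored = DefaultStringifier.load(conf, "wikipedia.categories", HashSet.class); // task side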

From source file:org.apache.mahout.avro.text.mapred.WikipediaToAvroDocuments.java

License:Apache License

/**
 * Run the job
 * 
 * @param input
 *          the input pathname String
 * @param output
 *          the output pathname String
 * @param catFile
 *          the file containing the Wikipedia categories
 * @param exactMatchOnly
 *          if true, then the Wikipedia category must match exactly instead of
 *          simply containing the category string
 * @param all
 *          if true select all categories
 */
public static int runJob(String input, String output, String catFile, boolean exactMatchOnly, boolean all)
        throws IOException {
    JobClient client = new JobClient();
    JobConf conf = new JobConf(WikipediaToAvroDocuments.class);
    if (log.isInfoEnabled()) {
        log.info("Input: " + input + " Out: " + output + " Categories: " + catFile + " All Files: " + all);
    }

    Path inPath = new Path(input);
    Path outPath = new Path(output);

    FileInputFormat.setInputPaths(conf, inPath);
    FileOutputFormat.setOutputPath(conf, outPath);
    //AvroOutputFormat.setClass(conf, AvroDocument.class);
    //AvroOutputFormat.setSchema(conf, AvroDocument._SCHEMA);

    conf.set("xmlinput.start", "<page>");
    conf.set("xmlinput.end", "</page>");
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(AvroDocument.class);
    conf.setBoolean("exact.match.only", exactMatchOnly);
    conf.setBoolean("all.files", all);
    conf.setMapperClass(WikipediaAvroDocumentMapper.class);
    conf.setInputFormat(XmlInputFormat.class);
    conf.setReducerClass(IdentityReducer.class);
    conf.setOutputFormat(AvroOutputFormat.class);

    AvroOutputFormat.setAvroOutputClass(conf, AvroDocument.class);

    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    if (dfs.exists(outPath)) {
        dfs.delete(outPath, true);
    }

    Set<String> categories = new HashSet<String>();
    if (catFile.equals("") == false) {
        for (String line : new FileLineIterable(new File(catFile))) {
            categories.add(line.trim().toLowerCase());
        }
    }

    DefaultStringifier<Set<String>> setStringifier = new DefaultStringifier<Set<String>>(conf,
            GenericsUtil.getClass(categories));

    String categoriesStr = setStringifier.toString(categories);

    conf.set("wikipedia.categories", categoriesStr);

    client.setConf(conf);
    RunningJob job = JobClient.runJob(conf);
    job.waitForCompletion();
    return job.isSuccessful() ? 1 : 0;
}
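
A hypothetical invocation of this driver; the paths and category file are made up. Note the return value is 1 on success, 0 on failure:

    int result = WikipediaToAvroDocuments.runJob(
            "/wikipedia/chunks", "/wikipedia/avro-out", "/local/categories.txt",
            false,  // substring category match is enough
            false); // restrict to the listed categories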

From source file:org.apache.mahout.classifier.bayes.BayesThetaNormalizerDriver.java

License:Apache License

/**
 * Run the job
 *
 * @param input  the input pathname String
 * @param output the output pathname String
 */
public static void runJob(String input, String output) throws IOException {
    JobClient client = new JobClient();
    JobConf conf = new JobConf(BayesThetaNormalizerDriver.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(DoubleWritable.class);
    FileInputFormat.addInputPath(conf, new Path(output + "/trainer-tfIdf/trainer-tfIdf"));
    Path outPath = new Path(output + "/trainer-thetaNormalizer");
    FileOutputFormat.setOutputPath(conf, outPath);
    conf.setNumMapTasks(100);
    //conf.setNumReduceTasks(1);
    conf.setMapperClass(BayesThetaNormalizerMapper.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setCombinerClass(BayesThetaNormalizerReducer.class);
    conf.setReducerClass(BayesThetaNormalizerReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.set("io.serializations",
            "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");
    // Don't ever forget this: Hadoop conf parameters like io.serializations can make or break a piece of code.

    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    if (dfs.exists(outPath)) {
        dfs.delete(outPath, true);
    }

    Path Sigma_kFiles = new Path(output + "/trainer-weights/Sigma_k/*");
    Map<String, Double> labelWeightSum = SequenceFileModelReader.readLabelSums(dfs, Sigma_kFiles, conf);
    DefaultStringifier<Map<String, Double>> mapStringifier = new DefaultStringifier<Map<String, Double>>(conf,
            GenericsUtil.getClass(labelWeightSum));
    String labelWeightSumString = mapStringifier.toString(labelWeightSum);

    log.info("Sigma_k for Each Label");
    Map<String, Double> c = mapStringifier.fromString(labelWeightSumString);
    log.info("{}", c);
    conf.set("cnaivebayes.sigma_k", labelWeightSumString);

    Path sigma_kSigma_jFile = new Path(output + "/trainer-weights/Sigma_kSigma_j/*");
    double sigma_jSigma_k = SequenceFileModelReader.readSigma_jSigma_k(dfs, sigma_kSigma_jFile, conf);
    DefaultStringifier<Double> stringifier = new DefaultStringifier<Double>(conf, Double.class);
    String sigma_jSigma_kString = stringifier.toString(sigma_jSigma_k);

    log.info("Sigma_kSigma_j for each Label and for each Features");
    double retSigma_jSigma_k = stringifier.fromString(sigma_jSigma_kString);
    log.info("{}", retSigma_jSigma_k);
    conf.set("cnaivebayes.sigma_jSigma_k", sigma_jSigma_kString);

    Path vocabCountFile = new Path(output + "/trainer-tfIdf/trainer-vocabCount/*");
    double vocabCount = SequenceFileModelReader.readVocabCount(dfs, vocabCountFile, conf);
    String vocabCountString = stringifier.toString(vocabCount);

    log.info("Vocabulary Count");
    conf.set("cnaivebayes.vocabCount", vocabCountString);
    double retvocabCount = stringifier.fromString(vocabCountString);
    log.info("{}", retvocabCount);

    client.setConf(conf);

    JobClient.runJob(conf);

}
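
The io.serializations line in this driver is what makes the DefaultStringifier uses above work: Map<String, Double> and Double are plain Java types, not Writables, so they need JavaSerialization. A minimal sketch of that dependency in isolation (inside any method that throws IOException):

    Configuration conf = new Configuration();
    conf.setStrings("io.serializations",
            "org.apache.hadoop.io.serializer.JavaSerialization",
            "org.apache.hadoop.io.serializer.WritableSerialization");
    DefaultStringifier<Double> stringifier = new DefaultStringifier<Double>(conf, Double.class);
    String encoded = stringifier.toString(42.0);
    System.out.println(stringifier.fromString(encoded)); // 42.0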

From source file:org.apache.mahout.classifier.bayes.BayesThetaNormalizerMapper.java

License:Apache License

@Override
public void configure(JobConf job) {
    try {
        if (labelWeightSum == null) {
            labelWeightSum = new HashMap<String, Double>();

            DefaultStringifier<Map<String, Double>> mapStringifier = new DefaultStringifier<Map<String, Double>>(
                    job, GenericsUtil.getClass(labelWeightSum));

            String labelWeightSumString = mapStringifier.toString(labelWeightSum);
            labelWeightSumString = job.get("cnaivebayes.sigma_k", labelWeightSumString);
            labelWeightSum = mapStringifier.fromString(labelWeightSumString);

            DefaultStringifier<Double> stringifier = new DefaultStringifier<Double>(job,
                    GenericsUtil.getClass(sigma_jSigma_k));
            String sigma_jSigma_kString = stringifier.toString(sigma_jSigma_k);
            sigma_jSigma_kString = job.get("cnaivebayes.sigma_jSigma_k", sigma_jSigma_kString);
            sigma_jSigma_k = stringifier.fromString(sigma_jSigma_kString);

            String vocabCountString = stringifier.toString(vocabCount);
            vocabCountString = job.get("cnaivebayes.vocabCount", vocabCountString);
            vocabCount = stringifier.fromString(vocabCountString);

        }
    } catch (IOException ex) {
        log.warn(ex.toString(), ex);
    }
}