Usage examples for org.apache.hadoop.io.DefaultStringifier.fromString
@Override public T fromString(String str) throws IOException
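fromString(String) decodes a value that was previously encoded into a Configuration string with toString(T). Before the examples, here is a minimal round-trip sketch; the key name "example.count" and the standalone main class are illustrative assumptions, not taken from the sources below.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.DefaultStringifier;

// Minimal sketch, assuming JavaSerialization is acceptable for the value type.
public class StringifierRoundTrip {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // DefaultStringifier needs a registered serialization for the value class;
        // Integer is java.io.Serializable, so JavaSerialization covers it.
        conf.set("io.serializations",
            "org.apache.hadoop.io.serializer.JavaSerialization,"
                + "org.apache.hadoop.io.serializer.WritableSerialization");

        DefaultStringifier<Integer> stringifier =
            new DefaultStringifier<Integer>(conf, Integer.class);

        // Encode the value to a Base64 string and park it in the configuration.
        conf.set("example.count", stringifier.toString(42)); // "example.count" is a made-up key

        // Later (typically on the task side), decode it back.
        Integer decoded = stringifier.fromString(conf.get("example.count"));
        System.out.println(decoded); // prints 42
    }
}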
From source file:co.nubetech.apache.hadoop.mapred.DBQueryInputFormat.java
License:Apache License
@Override
protected RecordReader<LongWritable, GenericDBWritable> createDBRecordReader(
        org.apache.hadoop.mapreduce.lib.db.DBInputFormat.DBInputSplit split, Configuration conf)
        throws IOException {
    org.apache.hadoop.mapreduce.lib.db.DBConfiguration dbConf = getDBConf();
    @SuppressWarnings("unchecked")
    // Class<T> inputClass = (Class<T>) (dbConf.getInputClass());
    String dbProductName = getDBProductName();
    logger.debug("Creating db record reader for db product: " + dbProductName);
    ArrayList params = null;
    try {
        if (conf.get(HIHOConf.QUERY_PARAMS) != null) {
            logger.debug("creating stringifier in DBQueryInputFormat");
            DefaultStringifier<ArrayList> stringifier =
                new DefaultStringifier<ArrayList>(conf, ArrayList.class);
            logger.debug("created stringifier");
            params = stringifier.fromString(conf.get(HIHOConf.QUERY_PARAMS));
            logger.debug("created params");
        }
        // use database product name to determine appropriate record reader.
        if (dbProductName.startsWith("MYSQL")) {
            // use MySQL-specific db reader.
            return new MySQLQueryRecordReader(split, conf, getConnection(), dbConf,
                dbConf.getInputConditions(), dbConf.getInputFieldNames(),
                dbConf.getInputTableName(), params);
        } else {
            // Generic reader.
            return new DBQueryRecordReader(split, conf, getConnection(), dbConf,
                dbConf.getInputConditions(), dbConf.getInputFieldNames(),
                dbConf.getInputTableName(), dbProductName, params);
        }
    } catch (SQLException ex) {
        throw new IOException(ex.getMessage());
    }
}
From source file:co.nubetech.hiho.mapreduce.lib.db.DBQueryInputFormat.java
License:Apache License
@Override
protected RecordReader<LongWritable, GenericDBWritable> createDBRecordReader(DBInputSplit split,
        Configuration conf) throws IOException {
    DBConfiguration dbConf = getDBConf();
    @SuppressWarnings("unchecked")
    // Class<T> inputClass = (Class<T>) (dbConf.getInputClass());
    String dbProductName = getDBProductName();
    logger.debug("Creating db record reader for db product: " + dbProductName);
    ArrayList params = null;
    try {
        if (conf.get(HIHOConf.QUERY_PARAMS) != null) {
            logger.debug("creating stringifier in DBQueryInputFormat");
            DefaultStringifier<ArrayList> stringifier =
                new DefaultStringifier<ArrayList>(conf, ArrayList.class);
            logger.debug("created stringifier");
            params = stringifier.fromString(conf.get(HIHOConf.QUERY_PARAMS));
            logger.debug("created params");
        }
        // use database product name to determine appropriate record reader.
        if (dbProductName.startsWith("MYSQL")) {
            // use MySQL-specific db reader.
            return new MySQLQueryRecordReader(split, conf, getConnection(), dbConf,
                dbConf.getInputConditions(), dbConf.getInputFieldNames(),
                dbConf.getInputTableName(), params);
        } else {
            // Generic reader.
            return new DBQueryRecordReader(split, conf, getConnection(), dbConf,
                dbConf.getInputConditions(), dbConf.getInputFieldNames(),
                dbConf.getInputTableName(), dbProductName, params);
        }
    } catch (SQLException ex) {
        throw new IOException(ex.getMessage());
    }
}
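Both readers above expect HIHOConf.QUERY_PARAMS to already hold a DefaultStringifier-encoded ArrayList. The job-setup side is not part of these sources, so the following is only a sketch of how such a value could be written; the parameter value is invented.

// Sketch of the producing side (assumed, not from the HIHO sources):
// encode query parameters into the job configuration before submission.
Configuration conf = new Configuration();
conf.set("io.serializations",
    "org.apache.hadoop.io.serializer.JavaSerialization,"
        + "org.apache.hadoop.io.serializer.WritableSerialization");
ArrayList params = new ArrayList();
params.add("2012-01-01"); // invented query parameter
DefaultStringifier<ArrayList> stringifier = new DefaultStringifier<ArrayList>(conf, ArrayList.class);
conf.set(HIHOConf.QUERY_PARAMS, stringifier.toString(params));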
From source file:org.apache.mahout.avro.text.mapred.WikipediaAvroDocumentMapper.java
License:Apache License
@Override
public void configure(JobConf job) {
    try {
        if (inputCategories == null) {
            Set<String> newCategories = new HashSet<String>();
            DefaultStringifier<Set<String>> setStringifier =
                new DefaultStringifier<Set<String>>(job, GenericsUtil.getClass(newCategories));
            String categoriesStr = setStringifier.toString(newCategories);
            categoriesStr = job.get("wikipedia.categories", categoriesStr);
            inputCategories = setStringifier.fromString(categoriesStr);
        }
        exactMatchOnly = job.getBoolean("exact.match.only", false);
        all = job.getBoolean("all.files", true);
    } catch (IOException ex) {
        throw new IllegalStateException(ex);
    }
    log.info("Configure: Input Categories size: " + inputCategories.size() + " All: " + all
        + " Exact Match: " + exactMatchOnly);
}
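Note the idiom in this configure() method, repeated in the mapper examples below: an empty default is stringified first, so that job.get(key, default) always returns a parsable encoded value and fromString() never sees null or raw text. Reduced to its core (with a made-up key name, for illustration only):

// Core of the default-value idiom ("example.categories" is a made-up key).
Set<String> defaults = new HashSet<String>();
DefaultStringifier<Set<String>> setStringifier =
    new DefaultStringifier<Set<String>>(job, GenericsUtil.getClass(defaults));
String encoded = job.get("example.categories", setStringifier.toString(defaults));
Set<String> categories = setStringifier.fromString(encoded);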
From source file:org.apache.mahout.classifier.bayes.BayesThetaNormalizerDriver.java
License:Apache License
/**
 * Run the job
 *
 * @param input the input pathname String
 * @param output the output pathname String
 */
public static void runJob(String input, String output) throws IOException {
    JobClient client = new JobClient();
    JobConf conf = new JobConf(BayesThetaNormalizerDriver.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(DoubleWritable.class);
    FileInputFormat.addInputPath(conf, new Path(output + "/trainer-tfIdf/trainer-tfIdf"));
    Path outPath = new Path(output + "/trainer-thetaNormalizer");
    FileOutputFormat.setOutputPath(conf, outPath);
    conf.setNumMapTasks(100);
    //conf.setNumReduceTasks(1);
    conf.setMapperClass(BayesThetaNormalizerMapper.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setCombinerClass(BayesThetaNormalizerReducer.class);
    conf.setReducerClass(BayesThetaNormalizerReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    // Don't ever forget this. People should keep track of how Hadoop conf parameters
    // can make or break a piece of code.
    conf.set("io.serializations",
        "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");

    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    if (dfs.exists(outPath)) {
        dfs.delete(outPath, true);
    }

    Path Sigma_kFiles = new Path(output + "/trainer-weights/Sigma_k/*");
    Map<String, Double> labelWeightSum = SequenceFileModelReader.readLabelSums(dfs, Sigma_kFiles, conf);
    DefaultStringifier<Map<String, Double>> mapStringifier =
        new DefaultStringifier<Map<String, Double>>(conf, GenericsUtil.getClass(labelWeightSum));
    String labelWeightSumString = mapStringifier.toString(labelWeightSum);

    log.info("Sigma_k for Each Label");
    Map<String, Double> c = mapStringifier.fromString(labelWeightSumString);
    log.info("{}", c);
    conf.set("cnaivebayes.sigma_k", labelWeightSumString);

    Path sigma_kSigma_jFile = new Path(output + "/trainer-weights/Sigma_kSigma_j/*");
    double sigma_jSigma_k = SequenceFileModelReader.readSigma_jSigma_k(dfs, sigma_kSigma_jFile, conf);
    DefaultStringifier<Double> stringifier = new DefaultStringifier<Double>(conf, Double.class);
    String sigma_jSigma_kString = stringifier.toString(sigma_jSigma_k);

    log.info("Sigma_kSigma_j for each Label and for each Features");
    double retSigma_jSigma_k = stringifier.fromString(sigma_jSigma_kString);
    log.info("{}", retSigma_jSigma_k);
    conf.set("cnaivebayes.sigma_jSigma_k", sigma_jSigma_kString);

    Path vocabCountFile = new Path(output + "/trainer-tfIdf/trainer-vocabCount/*");
    double vocabCount = SequenceFileModelReader.readVocabCount(dfs, vocabCountFile, conf);
    String vocabCountString = stringifier.toString(vocabCount);

    log.info("Vocabulary Count");
    conf.set("cnaivebayes.vocabCount", vocabCountString);
    double retvocabCount = stringifier.fromString(vocabCountString);
    log.info("{}", retvocabCount);

    client.setConf(conf);
    JobClient.runJob(conf);
}
From source file:org.apache.mahout.classifier.bayes.BayesThetaNormalizerMapper.java
License:Apache License
@Override
public void configure(JobConf job) {
    try {
        if (labelWeightSum == null) {
            labelWeightSum = new HashMap<String, Double>();
            DefaultStringifier<Map<String, Double>> mapStringifier =
                new DefaultStringifier<Map<String, Double>>(job, GenericsUtil.getClass(labelWeightSum));
            String labelWeightSumString = mapStringifier.toString(labelWeightSum);
            labelWeightSumString = job.get("cnaivebayes.sigma_k", labelWeightSumString);
            labelWeightSum = mapStringifier.fromString(labelWeightSumString);

            DefaultStringifier<Double> stringifier =
                new DefaultStringifier<Double>(job, GenericsUtil.getClass(sigma_jSigma_k));
            String sigma_jSigma_kString = stringifier.toString(sigma_jSigma_k);
            sigma_jSigma_kString = job.get("cnaivebayes.sigma_jSigma_k", sigma_jSigma_kString);
            sigma_jSigma_k = stringifier.fromString(sigma_jSigma_kString);

            String vocabCountString = stringifier.toString(vocabCount);
            vocabCountString = job.get("cnaivebayes.vocabCount", vocabCountString);
            vocabCount = stringifier.fromString(vocabCountString);
        }
    } catch (IOException ex) {
        log.warn(ex.toString(), ex);
    }
}
From source file:org.apache.mahout.classifier.bayes.common.BayesFeatureDriver.java
License:Apache License
/**
 * Run the job
 *
 * @param input the input pathname String
 * @param output the output pathname String
 * @param gramSize the n-gram size
 */
public static void runJob(String input, String output, int gramSize) throws IOException {
    JobClient client = new JobClient();
    JobConf conf = new JobConf(BayesFeatureDriver.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(DoubleWritable.class);
    FileInputFormat.setInputPaths(conf, new Path(input));
    Path outPath = new Path(output);
    FileOutputFormat.setOutputPath(conf, outPath);
    conf.setNumMapTasks(100);
    //conf.setNumReduceTasks(1);
    conf.setMapperClass(BayesFeatureMapper.class);
    conf.setInputFormat(KeyValueTextInputFormat.class);
    conf.setCombinerClass(BayesFeatureReducer.class);
    conf.setReducerClass(BayesFeatureReducer.class);
    conf.setOutputFormat(BayesFeatureOutputFormat.class);
    // Don't ever forget this. People should keep track of how Hadoop conf parameters
    // can make or break a piece of code.
    conf.set("io.serializations",
        "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");

    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    if (dfs.exists(outPath)) {
        dfs.delete(outPath, true);
    }

    DefaultStringifier<Integer> intStringifier = new DefaultStringifier<Integer>(conf, Integer.class);
    String gramSizeString = intStringifier.toString(gramSize);
    log.info("{}", intStringifier.fromString(gramSizeString));
    conf.set("bayes.gramSize", gramSizeString);

    client.setConf(conf);
    JobClient.runJob(conf);
}
From source file:org.apache.mahout.classifier.bayes.common.BayesFeatureMapper.java
License:Apache License
@Override
public void configure(JobConf job) {
    try {
        DefaultStringifier<Integer> intStringifier = new DefaultStringifier<Integer>(job, Integer.class);
        String gramSizeString = intStringifier.toString(gramSize);
        gramSizeString = job.get("bayes.gramSize", gramSizeString);
        gramSize = intStringifier.fromString(gramSizeString);
    } catch (IOException ex) {
        log.warn(ex.toString(), ex);
    }
}
From source file:org.apache.mahout.classifier.bayes.common.BayesTfIdfDriver.java
License:Apache License
/**
 * Run the job
 *
 * @param input the input pathname String
 * @param output the output pathname String
 */
public static void runJob(String input, String output) throws IOException {
    JobClient client = new JobClient();
    JobConf conf = new JobConf(BayesTfIdfDriver.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(DoubleWritable.class);
    FileInputFormat.addInputPath(conf, new Path(output + "/trainer-termDocCount"));
    FileInputFormat.addInputPath(conf, new Path(output + "/trainer-wordFreq"));
    FileInputFormat.addInputPath(conf, new Path(output + "/trainer-featureCount"));
    Path outPath = new Path(output + "/trainer-tfIdf");
    FileOutputFormat.setOutputPath(conf, outPath);
    conf.setNumMapTasks(100);
    conf.setMapperClass(BayesTfIdfMapper.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setCombinerClass(BayesTfIdfReducer.class);
    conf.setReducerClass(BayesTfIdfReducer.class);
    conf.setOutputFormat(BayesTfIdfOutputFormat.class);
    // Don't ever forget this. People should keep track of how Hadoop conf parameters
    // can make or break a piece of code.
    conf.set("io.serializations",
        "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");

    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    if (dfs.exists(outPath)) {
        dfs.delete(outPath, true);
    }

    Path interimFile = new Path(output + "/trainer-docCount/part-*");
    Map<String, Double> labelDocumentCounts = SequenceFileModelReader.readLabelDocumentCounts(dfs, interimFile, conf);
    DefaultStringifier<Map<String, Double>> mapStringifier =
        new DefaultStringifier<Map<String, Double>>(conf, GenericsUtil.getClass(labelDocumentCounts));
    String labelDocumentCountString = mapStringifier.toString(labelDocumentCounts);

    log.info("Counts of documents in Each Label");
    Map<String, Double> c = mapStringifier.fromString(labelDocumentCountString);
    log.info("{}", c);
    conf.set("cnaivebayes.labelDocumentCounts", labelDocumentCountString);

    client.setConf(conf);
    JobClient.runJob(conf);
}
From source file:org.apache.mahout.classifier.bayes.common.BayesTfIdfMapper.java
License:Apache License
@Override
public void configure(JobConf job) {
    try {
        if (labelDocumentCounts == null) {
            labelDocumentCounts = new HashMap<String, Double>();
            DefaultStringifier<Map<String, Double>> mapStringifier =
                new DefaultStringifier<Map<String, Double>>(job, GenericsUtil.getClass(labelDocumentCounts));
            String labelDocumentCountString = mapStringifier.toString(labelDocumentCounts);
            labelDocumentCountString = job.get("cnaivebayes.labelDocumentCounts", labelDocumentCountString);
            labelDocumentCounts = mapStringifier.fromString(labelDocumentCountString);
        }
    } catch (IOException ex) {
        log.warn(ex.toString(), ex);
    }
}
From source file:org.apache.mahout.classifier.bayes.mapreduce.bayes.BayesThetaNormalizerDriver.java
License:Apache License
@Override
public void runJob(Path input, Path output, BayesParameters params) throws IOException {
    Configurable client = new JobClient();
    JobConf conf = new JobConf(BayesThetaNormalizerDriver.class);
    conf.setJobName("Bayes Theta Normalizer Driver running over input: " + input);

    conf.setOutputKeyClass(StringTuple.class);
    conf.setOutputValueClass(DoubleWritable.class);
    FileInputFormat.addInputPath(conf, new Path(output, "trainer-tfIdf/trainer-tfIdf"));
    Path outPath = new Path(output, "trainer-thetaNormalizer");
    FileOutputFormat.setOutputPath(conf, outPath);
    // conf.setNumMapTasks(100);
    // conf.setNumReduceTasks(1);
    conf.setMapperClass(BayesThetaNormalizerMapper.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setCombinerClass(BayesThetaNormalizerReducer.class);
    conf.setReducerClass(BayesThetaNormalizerReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    // Don't ever forget this. People should keep track of how Hadoop conf parameters
    // can make or break a piece of code.
    conf.set("io.serializations",
        "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");

    HadoopUtil.overwriteOutput(outPath);
    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);

    Path sigmaKFiles = new Path(output, "trainer-weights/Sigma_k/*");
    Map<String, Double> labelWeightSum = SequenceFileModelReader.readLabelSums(dfs, sigmaKFiles, conf);
    DefaultStringifier<Map<String, Double>> mapStringifier =
        new DefaultStringifier<Map<String, Double>>(conf, GenericsUtil.getClass(labelWeightSum));
    String labelWeightSumString = mapStringifier.toString(labelWeightSum);

    log.info("Sigma_k for Each Label");
    Map<String, Double> c = mapStringifier.fromString(labelWeightSumString);
    log.info("{}", c);
    conf.set("cnaivebayes.sigma_k", labelWeightSumString);

    Path sigmaJSigmaKFile = new Path(output, "trainer-weights/Sigma_kSigma_j/*");
    double sigmaJSigmaK = SequenceFileModelReader.readSigmaJSigmaK(dfs, sigmaJSigmaKFile, conf);
    DefaultStringifier<Double> stringifier = new DefaultStringifier<Double>(conf, Double.class);
    String sigmaJSigmaKString = stringifier.toString(sigmaJSigmaK);

    log.info("Sigma_kSigma_j for each Label and for each Features");
    double retSigmaJSigmaK = stringifier.fromString(sigmaJSigmaKString);
    log.info("{}", retSigmaJSigmaK);
    conf.set("cnaivebayes.sigma_jSigma_k", sigmaJSigmaKString);

    Path vocabCountFile = new Path(output, "trainer-tfIdf/trainer-vocabCount/*");
    double vocabCount = SequenceFileModelReader.readVocabCount(dfs, vocabCountFile, conf);
    String vocabCountString = stringifier.toString(vocabCount);

    log.info("Vocabulary Count");
    conf.set("cnaivebayes.vocabCount", vocabCountString);
    double retvocabCount = stringifier.fromString(vocabCountString);
    log.info("{}", retvocabCount);

    conf.set("bayes.parameters", params.toString());
    conf.set("output.table", output.toString());

    client.setConf(conf);
    JobClient.runJob(conf);
}