List of usage examples for org.apache.hadoop.io.DefaultStringifier#toString
@Override public String toString(T obj) throws IOException
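All of the examples below follow the same round-trip pattern: toString() serializes an object through Hadoop's serialization framework and Base64-encodes it so it can be stored as a string property in a Configuration, and fromString() reverses the process on the other side of the job. A minimal self-contained sketch of that pattern (the key "example.map" and the map contents are hypothetical, not taken from the examples):

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.DefaultStringifier;
import org.apache.hadoop.util.GenericsUtil;

public class StringifierRoundTrip {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // Without JavaSerialization, DefaultStringifier cannot handle plain Java types.
        conf.set("io.serializations",
                "org.apache.hadoop.io.serializer.JavaSerialization,"
                + "org.apache.hadoop.io.serializer.WritableSerialization");

        Map<String, Double> weights = new HashMap<String, Double>();
        weights.put("label", 0.5); // hypothetical payload

        DefaultStringifier<Map<String, Double>> stringifier =
                new DefaultStringifier<Map<String, Double>>(conf, GenericsUtil.getClass(weights));

        // toString() serializes and Base64-encodes the object...
        conf.set("example.map", stringifier.toString(weights));
        // ...and fromString() reverses it on the other side (e.g. in a mapper's configure()).
        Map<String, Double> restored = stringifier.fromString(conf.get("example.map"));
        System.out.println(restored);
    }
}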
From source file:org.apache.mahout.classifier.cbayes.CBayesNormalizedWeightMapper.java
License:Apache License
@Override
public void configure(JobConf job) {
    try {
        if (thetaNormalizer == null) {
            thetaNormalizer = new HashMap<String, Double>();
            DefaultStringifier<Map<String, Double>> mapStringifier = new DefaultStringifier<Map<String, Double>>(
                    job, GenericsUtil.getClass(thetaNormalizer));
            String thetaNormalizationsString = mapStringifier.toString(thetaNormalizer);
            thetaNormalizationsString = job.get("cnaivebayes.thetaNormalizations", thetaNormalizationsString);
            thetaNormalizer = mapStringifier.fromString(thetaNormalizationsString);
        }
    } catch (IOException ex) {
        log.warn(ex.toString(), ex);
    }
}
From source file:org.apache.mahout.classifier.cbayes.CBayesThetaDriver.java
License:Apache License
/**
 * Run the job
 *
 * @param input the input pathname String
 * @param output the output pathname String
 */
public static void runJob(String input, String output) throws IOException {
    JobClient client = new JobClient();
    JobConf conf = new JobConf(CBayesThetaDriver.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(DoubleWritable.class);

    FileInputFormat.addInputPath(conf, new Path(output + "/trainer-weights/Sigma_j"));
    FileInputFormat.addInputPath(conf, new Path(output + "/trainer-tfIdf/trainer-tfIdf"));
    Path outPath = new Path(output + "/trainer-theta");
    FileOutputFormat.setOutputPath(conf, outPath);
    //conf.setNumMapTasks(1);
    //conf.setNumReduceTasks(1);
    conf.setMapperClass(CBayesThetaMapper.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    //conf.setCombinerClass(CBayesThetaReducer.class);
    conf.setReducerClass(CBayesThetaReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.set("io.serializations",
            "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");
    // Don't ever forget this: Hadoop conf parameters can make or break a piece of code.

    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    if (dfs.exists(outPath)) {
        dfs.delete(outPath, true);
    }

    Path Sigma_kFiles = new Path(output + "/trainer-weights/Sigma_k/*");
    Map<String, Double> labelWeightSum = SequenceFileModelReader.readLabelSums(dfs, Sigma_kFiles, conf);
    DefaultStringifier<Map<String, Double>> mapStringifier = new DefaultStringifier<Map<String, Double>>(conf,
            GenericsUtil.getClass(labelWeightSum));
    String labelWeightSumString = mapStringifier.toString(labelWeightSum);
    log.info("Sigma_k for Each Label");
    Map<String, Double> c = mapStringifier.fromString(labelWeightSumString);
    log.info("{}", c);
    conf.set("cnaivebayes.sigma_k", labelWeightSumString);

    Path sigma_kSigma_jFile = new Path(output + "/trainer-weights/Sigma_kSigma_j/*");
    double sigma_jSigma_k = SequenceFileModelReader.readSigma_jSigma_k(dfs, sigma_kSigma_jFile, conf);
    DefaultStringifier<Double> stringifier = new DefaultStringifier<Double>(conf, Double.class);
    String sigma_jSigma_kString = stringifier.toString(sigma_jSigma_k);
    log.info("Sigma_kSigma_j for each Label and for each Features");
    double retSigma_jSigma_k = stringifier.fromString(sigma_jSigma_kString);
    log.info("{}", retSigma_jSigma_k);
    conf.set("cnaivebayes.sigma_jSigma_k", sigma_jSigma_kString);

    Path vocabCountFile = new Path(output + "/trainer-tfIdf/trainer-vocabCount/*");
    double vocabCount = SequenceFileModelReader.readVocabCount(dfs, vocabCountFile, conf);
    String vocabCountString = stringifier.toString(vocabCount);
    log.info("Vocabulary Count");
    conf.set("cnaivebayes.vocabCount", vocabCountString);
    double retvocabCount = stringifier.fromString(vocabCountString);
    log.info("{}", retvocabCount);

    client.setConf(conf);
    JobClient.runJob(conf);
}
From source file:org.apache.mahout.classifier.cbayes.CBayesThetaMapper.java
License:Apache License
@Override
public void configure(JobConf job) {
    try {
        if (labelWeightSum == null) {
            labelWeightSum = new HashMap<String, Double>();
            DefaultStringifier<Map<String, Double>> mapStringifier = new DefaultStringifier<Map<String, Double>>(
                    job, GenericsUtil.getClass(labelWeightSum));
            String labelWeightSumString = mapStringifier.toString(labelWeightSum);
            labelWeightSumString = job.get("cnaivebayes.sigma_k", labelWeightSumString);
            labelWeightSum = mapStringifier.fromString(labelWeightSumString);

            DefaultStringifier<Double> stringifier = new DefaultStringifier<Double>(job,
                    GenericsUtil.getClass(sigma_jSigma_k));
            String sigma_jSigma_kString = stringifier.toString(sigma_jSigma_k);
            sigma_jSigma_kString = job.get("cnaivebayes.sigma_jSigma_k", sigma_jSigma_kString);
            sigma_jSigma_k = stringifier.fromString(sigma_jSigma_kString);

            String vocabCountString = stringifier.toString(vocabCount);
            vocabCountString = job.get("cnaivebayes.vocabCount", vocabCountString);
            vocabCount = stringifier.fromString(vocabCountString);
        }
    } catch (IOException ex) {
        log.info(ex.toString(), ex);
    }
}
From source file:org.apache.mahout.classifier.cbayes.CBayesThetaNormalizerDriver.java
License:Apache License
/**
 * Run the job
 *
 * @param input the input pathname String
 * @param output the output pathname String
 */
public static void runJob(String input, String output) throws IOException {
    JobClient client = new JobClient();
    JobConf conf = new JobConf(CBayesThetaNormalizerDriver.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(DoubleWritable.class);

    FileInputFormat.addInputPath(conf, new Path(output + "/trainer-weights/Sigma_j"));
    FileInputFormat.addInputPath(conf, new Path(output + "/trainer-tfIdf/trainer-tfIdf"));
    Path outPath = new Path(output + "/trainer-thetaNormalizer");
    FileOutputFormat.setOutputPath(conf, outPath);
    conf.setNumMapTasks(100);
    //conf.setNumReduceTasks(1);
    conf.setMapperClass(CBayesThetaNormalizerMapper.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setCombinerClass(CBayesThetaNormalizerReducer.class);
    conf.setReducerClass(CBayesThetaNormalizerReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.set("io.serializations",
            "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");
    // Don't ever forget this: Hadoop conf parameters can make or break a piece of code.

    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    if (dfs.exists(outPath)) {
        dfs.delete(outPath, true);
    }

    Path Sigma_kFiles = new Path(output + "/trainer-weights/Sigma_k/*");
    Map<String, Double> labelWeightSum = SequenceFileModelReader.readLabelSums(dfs, Sigma_kFiles, conf);
    DefaultStringifier<Map<String, Double>> mapStringifier = new DefaultStringifier<Map<String, Double>>(conf,
            GenericsUtil.getClass(labelWeightSum));
    String labelWeightSumString = mapStringifier.toString(labelWeightSum);
    log.info("Sigma_k for Each Label");
    Map<String, Double> c = mapStringifier.fromString(labelWeightSumString);
    log.info("{}", c);
    conf.set("cnaivebayes.sigma_k", labelWeightSumString);

    Path sigma_kSigma_jFile = new Path(output + "/trainer-weights/Sigma_kSigma_j/*");
    double sigma_jSigma_k = SequenceFileModelReader.readSigma_jSigma_k(dfs, sigma_kSigma_jFile, conf);
    DefaultStringifier<Double> stringifier = new DefaultStringifier<Double>(conf, Double.class);
    String sigma_jSigma_kString = stringifier.toString(sigma_jSigma_k);
    log.info("Sigma_kSigma_j for each Label and for each Features");
    double retSigma_jSigma_k = stringifier.fromString(sigma_jSigma_kString);
    log.info("{}", retSigma_jSigma_k);
    conf.set("cnaivebayes.sigma_jSigma_k", sigma_jSigma_kString);

    Path vocabCountFile = new Path(output + "/trainer-tfIdf/trainer-vocabCount/*");
    double vocabCount = SequenceFileModelReader.readVocabCount(dfs, vocabCountFile, conf);
    String vocabCountString = stringifier.toString(vocabCount);
    log.info("Vocabulary Count");
    conf.set("cnaivebayes.vocabCount", vocabCountString);
    double retvocabCount = stringifier.fromString(vocabCountString);
    log.info("{}", retvocabCount);

    client.setConf(conf);
    JobClient.runJob(conf);
}
From source file:org.apache.mahout.common.Parameters.java
License:Apache License
@Override
public String toString() {
    Configuration conf = new Configuration();
    conf.set("io.serializations",
            "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    DefaultStringifier<Map<String, String>> mapStringifier = new DefaultStringifier<Map<String, String>>(conf,
            GenericsUtil.getClass(params));
    try {
        return mapStringifier.toString(params);
    } catch (IOException e) {
        log.info("Encountered IOException while serializing; returning empty string", e);
        return "";
    }
}
From source file:org.apache.mahout.fpm.pfpgrowth.PFPGrowth.java
License:Apache License
/**
 * Generates the fList from the serialized string representation
 *
 * @param params
 * @param key
 * @param conf
 * @return Deserialized Feature Frequency List
 * @throws IOException
 */
public static List<Pair<String, Long>> deserializeList(Parameters params, String key, Configuration conf)
        throws IOException {
    List<Pair<String, Long>> list = new ArrayList<Pair<String, Long>>();
    conf.set("io.serializations",
            "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    DefaultStringifier<List<Pair<String, Long>>> listStringifier = new DefaultStringifier<List<Pair<String, Long>>>(
            conf, GenericsUtil.getClass(list));
    String serializedString = params.get(key, listStringifier.toString(list));
    list = listStringifier.fromString(serializedString);
    return list;
}
From source file:org.apache.mahout.fpm.pfpgrowth.PFPGrowth.java
License:Apache License
/**
 * Generates the gList (group ID mapping of various frequent features) Map from the
 * corresponding serialized representation
 *
 * @param params
 * @param key
 * @param conf
 * @return Deserialized Group List
 * @throws IOException
 */
public static Map<String, Long> deserializeMap(Parameters params, String key, Configuration conf)
        throws IOException {
    Map<String, Long> map = new HashMap<String, Long>();
    conf.set("io.serializations",
            "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    DefaultStringifier<Map<String, Long>> mapStringifier = new DefaultStringifier<Map<String, Long>>(conf,
            GenericsUtil.getClass(map));
    String gListString = params.get(key, mapStringifier.toString(map));
    map = mapStringifier.fromString(gListString);
    return map;
}
From source file:org.apache.mahout.fpm.pfpgrowth.PFPGrowth.java
License:Apache License
/**
 * Serializes the fList and returns the string representation of the List
 *
 * @param list
 * @param conf
 * @return Serialized String representation of List
 * @throws IOException
 */
private static String serializeList(List<Pair<String, Long>> list, Configuration conf) throws IOException {
    conf.set("io.serializations",
            "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    DefaultStringifier<List<Pair<String, Long>>> listStringifier = new DefaultStringifier<List<Pair<String, Long>>>(
            conf, GenericsUtil.getClass(list));
    return listStringifier.toString(list);
}
From source file:org.apache.mahout.fpm.pfpgrowth.PFPGrowth.java
License:Apache License
/**
 * Converts a given Map into a String using Hadoop's DefaultStringifier
 *
 * @param map
 * @param conf
 * @return Serialized String representation of the gList Map
 * @throws IOException
 */
private static String serializeMap(Map<String, Long> map, Configuration conf) throws IOException {
    conf.set("io.serializations",
            "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    DefaultStringifier<Map<String, Long>> mapStringifier = new DefaultStringifier<Map<String, Long>>(conf,
            GenericsUtil.getClass(map));
    return mapStringifier.toString(map);
}
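Taken together, the serialize*/deserialize* helpers above form a round trip through the job parameters. A hedged sketch of how the map variant pairs up, assuming it lives inside PFPGrowth; the key "pfp.gList" is hypothetical, and Parameters.set(String, String) is assumed to mirror the params.get(key, default) call used in deserializeMap():

static Map<String, Long> gListRoundTrip(Parameters params, Configuration conf) throws IOException {
    Map<String, Long> gList = new HashMap<String, Long>();
    gList.put("feature", 7L); // hypothetical group assignment

    // Driver side: serialize the map and stash it under a (hypothetical) key.
    params.set("pfp.gList", serializeMap(gList, conf));

    // Worker side: recover it; the empty-map default applies if the key is absent.
    return deserializeMap(params, "pfp.gList", conf);
}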
From source file:org.apache.mahout.text.wikipedia.WikipediaDatasetCreatorMapper.java
License:Apache License
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    Configuration conf = context.getConfiguration();

    if (inputCategories == null) {
        Set<String> newCategories = Sets.newHashSet();
        DefaultStringifier<Set<String>> setStringifier = new DefaultStringifier<Set<String>>(conf,
                GenericsUtil.getClass(newCategories));
        String categoriesStr = conf.get("wikipedia.categories", setStringifier.toString(newCategories));
        Set<String> inputCategoriesSet = setStringifier.fromString(categoriesStr);
        inputCategories = Lists.newArrayList(inputCategoriesSet);
        inputCategoryPatterns = Lists.newArrayListWithCapacity(inputCategories.size());
        for (String inputCategory : inputCategories) {
            inputCategoryPatterns.add(Pattern.compile(".*\\b" + inputCategory + "\\b.*"));
        }
    }

    exactMatchOnly = conf.getBoolean("exact.match.only", false);
    if (analyzer == null) {
        String analyzerStr = conf.get("analyzer.class", WikipediaAnalyzer.class.getName());
        analyzer = ClassUtils.instantiateAs(analyzerStr, Analyzer.class);
    }
    log.info("Configure: Input Categories size: {} Exact Match: {} Analyzer: {}", inputCategories.size(),
            exactMatchOnly, analyzer.getClass().getName());
}
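For completeness, a sketch of the driver-side counterpart that would populate "wikipedia.categories" before the job is submitted. The category values are hypothetical and the real Mahout driver may differ in detail; this only illustrates the toString() half that the setup() method above decodes:

static void setCategories(Configuration conf, Set<String> categories) throws IOException {
    // JavaSerialization is required for DefaultStringifier to handle a plain HashSet.
    conf.set("io.serializations",
            "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    DefaultStringifier<Set<String>> setStringifier = new DefaultStringifier<Set<String>>(conf,
            GenericsUtil.getClass(categories));
    // The mapper's setup() above recovers the set with the matching fromString() call.
    conf.set("wikipedia.categories", setStringifier.toString(categories));
}

// Example usage (hypothetical categories):
//   setCategories(conf, Sets.newHashSet("science", "history"));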