List of usage examples for org.apache.hadoop.io.DefaultStringifier.toString
@Override public String toString(T obj) throws IOException
From source file:org.apache.mahout.text.WikipediaMapper.java
License:Apache License
/**
 * Initializes the mapper from the job configuration.
 *
 * <p>When {@code inputCategories} has not been set yet, it is deserialized from the
 * {@code "wikipedia.categories"} configuration value; if that key is absent, the
 * stringified form of an empty set is used as the default. The {@code exactMatchOnly}
 * and {@code all} flags are read from {@code "exact.match.only"} (default {@code false})
 * and {@code "all.files"} (default {@code true}).
 *
 * @param context the task context supplying the configuration
 * @throws IOException if the superclass setup fails
 * @throws InterruptedException if the superclass setup is interrupted
 * @throws IllegalStateException if the category set cannot be (de)serialized
 */
@Override
protected void setup(Context context) throws IOException, InterruptedException {
  super.setup(context);
  Configuration jobConf = context.getConfiguration();
  try {
    if (inputCategories == null) {
      Set<String> emptyCategories = new HashSet<String>();
      DefaultStringifier<Set<String>> categoryCodec =
          new DefaultStringifier<Set<String>>(jobConf, GenericsUtil.getClass(emptyCategories));
      // The serialized empty set doubles as the fallback when the key is not configured.
      String fallback = categoryCodec.toString(emptyCategories);
      String serialized = jobConf.get("wikipedia.categories", fallback);
      inputCategories = categoryCodec.fromString(serialized);
    }
    exactMatchOnly = jobConf.getBoolean("exact.match.only", false);
    all = jobConf.getBoolean("all.files", true);
  } catch (IOException ex) {
    // Serialization problems are surfaced as an unchecked configuration failure.
    throw new IllegalStateException(ex);
  }
  Object[] logArgs = new Object[] { inputCategories.size(), all, exactMatchOnly };
  log.info("Configure: Input Categories size: {} All: {} Exact Match: {}", logArgs);
}
From source file:redpoll.clusterer.kmeans.KMeansDriver.java
License:Apache License
/**
 * Reads the initial cluster centers from a sequence file and stores them, stringified,
 * in the job configuration under {@code "redpoll.clusterer.kmeans.centers"}.
 *
 * <p>Each record's key (as a string) is mapped to its value. A fresh value instance is
 * allocated for every record because {@code SequenceFile.Reader.next(key, value)}
 * deserializes into the object it is given; reusing one instance would leave every map
 * entry pointing at the same object holding the last record read.
 *
 * @param filePath the file path to the single file that contains the input clusters
 * @param conf the job configuration to read with and to store the serialized centers in
 * @param fs the file system the cluster file lives on
 * @throws IOException if the file cannot be read or the centers cannot be serialized
 * @throws InstantiationException if the value class cannot be instantiated
 * @throws IllegalAccessException if the value class's no-arg constructor is inaccessible
 */
private static void loadClusters(String filePath, JobConf conf, FileSystem fs)
    throws IOException, InstantiationException, IllegalAccessException {
  HashMap<String, WritableVector> centers = new HashMap<String, WritableVector>();
  Path clusterPath = new Path(filePath);
  SequenceFile.Reader reader = new SequenceFile.Reader(fs, clusterPath, conf);
  try {
    Class<?> valueClass = reader.getValueClass();
    Text key = new Text();
    WritableVector value = (WritableVector) valueClass.newInstance();
    while (reader.next(key, value)) {
      centers.put(key.toString(), value);
      // Do not reuse the stored instance for the next record.
      value = (WritableVector) valueClass.newInstance();
    }
  } finally {
    // Release the underlying stream even when reading or instantiation fails.
    reader.close();
  }

  DefaultStringifier<HashMap<String, WritableVector>> stringifier =
      new DefaultStringifier<HashMap<String, WritableVector>>(conf, GenericsUtil.getClass(centers));
  String centersString = stringifier.toString(centers);
  conf.set("redpoll.clusterer.kmeans.centers", centersString);
}
From source file:redpoll.text.TfIdfDriver.java
License:Apache License
/** * Run the job// ww w .j ava 2 s. c o m * * @param input the input pathname String * @param output the output pathname String */ public static void runJob(String input, String output) throws IOException { JobClient client = new JobClient(); JobConf conf = new JobConf(TfIdfDriver.class); FileSystem fs = FileSystem.get(conf); Path inPath = new Path(input + "/tf"); FileInputFormat.setInputPaths(conf, inPath); Path outPath = new Path(output); FileOutputFormat.setOutputPath(conf, outPath); conf.setMapperClass(TfIdfMapper.class); conf.setReducerClass(TfIdfReducer.class); //conf.setNumMapTasks(10); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(TfIdfWritable.class); conf.setInputFormat(SequenceFileInputFormat.class); conf.setOutputFormat(TfIdfOutputFormat.class); conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization," + "org.apache.hadoop.io.serializer.WritableSerialization"); // serialize a term hashmap. Its key is the term , value is a term index of // the term vector. Path dfpath = new Path(input + "/df/part-00000"); SequenceFile.Reader reader = new SequenceFile.Reader(fs, dfpath, conf); Text key = new Text(); IntWritable value = new IntWritable(); HashMap<String, Integer> termMap = new HashMap<String, Integer>(); int index = 0; while ((reader.next(key, value))) { String termString = key.toString(); if (!termString.equals("redpoll.docs.num")) { termMap.put(key.toString(), index); index++; } else { conf.setInt("redpoll.docs.num", value.get()); } } reader.close(); DefaultStringifier<HashMap<String, Integer>> mapStringifier = new DefaultStringifier<HashMap<String, Integer>>( conf, GenericsUtil.getClass(termMap)); String termMapString = mapStringifier.toString(termMap); conf.setInt("redpoll.text.terms.num", index); // number of terms conf.set("redpoll.text.terms", termMapString); client.setConf(conf); JobClient.runJob(conf); }