Example usage for org.apache.hadoop.io DefaultStringifier toString

Introduction

This page collects example usages of the toString method of org.apache.hadoop.io.DefaultStringifier.

Prototype

@Override
    public String toString(T obj) throws IOException

Source Link

Usage

From source file:org.apache.mahout.text.WikipediaMapper.java

License:Apache License

/**
 * Initializes the mapper from the job configuration.
 *
 * <p>When {@code inputCategories} has not been populated yet, it is decoded from the
 * {@code wikipedia.categories} property; if that property is absent, the stringified
 * form of an empty set is used as the default. The {@code exact.match.only}
 * (default {@code false}) and {@code all.files} (default {@code true}) flags are read
 * afterwards.
 *
 * @param context the task context supplying the configuration
 * @throws IllegalStateException if the category set cannot be encoded or decoded
 */
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    final Configuration jobConf = context.getConfiguration();

    if (inputCategories == null) {
        try {
            // An empty set serves both as the type witness for the stringifier
            // and as the fallback value when the property is not configured.
            Set<String> emptyCategories = new HashSet<String>();
            DefaultStringifier<Set<String>> codec = new DefaultStringifier<Set<String>>(jobConf,
                    GenericsUtil.getClass(emptyCategories));

            String fallback = codec.toString(emptyCategories);
            String encoded = jobConf.get("wikipedia.categories", fallback);
            inputCategories = codec.fromString(encoded);
        } catch (IOException e) {
            throw new IllegalStateException(e);
        }
    }

    exactMatchOnly = jobConf.getBoolean("exact.match.only", false);
    all = jobConf.getBoolean("all.files", true);

    log.info("Configure: Input Categories size: {} All: {} Exact Match: {}",
            new Object[] { inputCategories.size(), all, exactMatchOnly });
}

From source file:redpoll.clusterer.kmeans.KMeansDriver.java

License:Apache License

/**
 * Loads cluster centers from a sequence file and stores them, stringified, in the job
 * configuration under {@code redpoll.clusterer.kmeans.centers}.
 *
 * <p>Each record's key (converted with {@code toString()}) maps to its own
 * {@code WritableVector} instance.
 *
 * @param filePath path of the single file that contains the input clusters
 * @param conf job configuration used to open the file and to receive the serialized centers
 * @param fs file system the cluster file is read from
 * @throws IOException if the file cannot be read or the centers cannot be serialized
 * @throws InstantiationException if the value class of the file cannot be instantiated
 * @throws IllegalAccessException if the value class or its no-arg constructor is not accessible
 */
private static void loadClusters(String filePath, JobConf conf, FileSystem fs)
        throws IOException, InstantiationException, IllegalAccessException {
    HashMap<String, WritableVector> centers = new HashMap<String, WritableVector>();
    Path clusterPath = new Path(filePath);
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, clusterPath, conf);
    try {
        Class<?> valueClass = reader.getValueClass();
        Text key = new Text();
        WritableVector value = (WritableVector) valueClass.newInstance();
        while (reader.next(key, value)) {
            centers.put(key.toString(), value);
            // reader.next() fills the object it is handed, so a fresh instance is
            // needed for the next record; reusing one would make every map entry
            // alias the last vector read.
            value = (WritableVector) valueClass.newInstance();
        }
    } finally {
        // Release the file handle even when reading or instantiation fails.
        reader.close();
    }

    DefaultStringifier<HashMap<String, WritableVector>> stringifier = new DefaultStringifier<HashMap<String, WritableVector>>(
            conf, GenericsUtil.getClass(centers));
    String centersString = stringifier.toString(centers);
    conf.set("redpoll.clusterer.kmeans.centers", centersString);
}

From source file:redpoll.text.TfIdfDriver.java

License:Apache License

/**
 * Configures and runs the TF-IDF job.
 *
 * <p>Term frequencies are read from {@code input + "/tf"}. Before submission, the
 * document-frequency file {@code input + "/df/part-00000"} is scanned to build a
 * term-to-index map, which is stringified into the {@code redpoll.text.terms} property.
 * The record keyed {@code redpoll.docs.num} is not a term: its value is copied into the
 * configuration property of the same name.
 *
 * @param input the input pathname String
 * @param output the output pathname String
 * @throws IOException if the document-frequency file cannot be read, the term map cannot
 *         be serialized, or the job fails
 */
public static void runJob(String input, String output) throws IOException {
    JobClient client = new JobClient();
    JobConf conf = new JobConf(TfIdfDriver.class);

    FileSystem fs = FileSystem.get(conf);
    Path inPath = new Path(input + "/tf");
    FileInputFormat.setInputPaths(conf, inPath);
    Path outPath = new Path(output);
    FileOutputFormat.setOutputPath(conf, outPath);

    conf.setMapperClass(TfIdfMapper.class);
    conf.setReducerClass(TfIdfReducer.class);
    //conf.setNumMapTasks(10);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(TfIdfWritable.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(TfIdfOutputFormat.class);

    // NOTE(review): presumably JavaSerialization is registered so that the
    // HashMap below can be handled by DefaultStringifier - confirm.
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");

    // Build the term map: key is the term, value is the term's index in the
    // term vector.
    Path dfpath = new Path(input + "/df/part-00000");
    HashMap<String, Integer> termMap = new HashMap<String, Integer>();
    int index = 0;
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, dfpath, conf);
    try {
        Text key = new Text();
        IntWritable value = new IntWritable();
        while (reader.next(key, value)) {
            String termString = key.toString();
            if (!termString.equals("redpoll.docs.num")) {
                termMap.put(termString, index);
                index++;
            } else {
                // Special record carrying the document count rather than a term.
                conf.setInt("redpoll.docs.num", value.get());
            }
        }
    } finally {
        // Release the file handle even when reading fails part-way.
        reader.close();
    }

    DefaultStringifier<HashMap<String, Integer>> mapStringifier = new DefaultStringifier<HashMap<String, Integer>>(
            conf, GenericsUtil.getClass(termMap));
    String termMapString = mapStringifier.toString(termMap);
    conf.setInt("redpoll.text.terms.num", index); // number of terms
    conf.set("redpoll.text.terms", termMapString);

    client.setConf(conf);
    JobClient.runJob(conf);
}