Example usage for org.apache.hadoop.io.Text.toString()

Introduction

This page collects example usages of the toString() method of org.apache.hadoop.io.Text.

Prototype

@Override
public String toString()

Source Link

Document

Converts the Text back to a String.

Usage

From source file:babel.prep.merge.PageMergeReducer.java

License:Apache License

/**
 * Folds every page received for {@code key} into one {@link Page} and emits it under the same key.
 *
 * <p>The resulting page is created from the key's string form and each incoming page is merged
 * into it. The page counter is bumped once per call; the merged-page counter is bumped only when
 * more than one page was folded in.
 *
 * @param key      key shared by all incoming pages; its string form seeds the new page
 * @param pages    pages to merge
 * @param output   collector receiving the single {@code (key, mergedPage)} pair
 * @param reporter not used by this method
 * @throws IOException declared by the signature; may be raised by {@code output.collect}
 */
public void reduce(Text key, Iterator<Page> pages, OutputCollector<Text, Page> output, Reporter reporter)
        throws IOException {
    final Page merged = new Page(key.toString());

    int mergedCount = 0;
    for (; pages.hasNext(); mergedCount++) {
        merged.merge(pages.next());
    }

    PageMerger.Stats.incPageCount();
    if (mergedCount >= 2) {
        PageMerger.Stats.incMergedPageCount();
    }

    output.collect(key, merged);
}

From source file:base.Example.java

License:Open Source License

/**
 * Creates an example from its textual form.
 *
 * <p>{@code docid} is reset to zero and the {@code fids}, {@code fvals} and {@code labels}
 * arrays start out empty; the string form of {@code value} is then handed to
 * {@code parseString}.
 *
 * @param value text whose string form is passed to {@code parseString}
 */
public Example(Text value) {
    // Start from a blank state before parsing.
    docid = 0;
    labels = new int[0];
    fvals = new float[0];
    fids = new int[0];

    parseString(value.toString());
}

From source file:bbuzz2011.stackoverflow.preprocess.xml.StackOverflowPostXMLMapper.java

License:Apache License

/**
 * Parses one XML post and, when it has a title and is a question, writes its title and parsed
 * body to the context.
 *
 * <p>Counters touched: {@code TITLES} on every call, {@code MISSING_TITLES} when the title is
 * null or empty (the post is then skipped), {@code QUESTIONS} when the post is written.
 *
 * @param key     input key; its value, narrowed to {@code int}, becomes the output key
 * @param value   XML text of a single post
 * @param context context providing the counters and receiving the output record
 * @throws SAXException             if the XML cannot be parsed
 * @throws IOException              declared by the signature; may come from parsing or writing
 * @throws XPathExpressionException if an XPath expression cannot be evaluated
 * @throws InterruptedException     declared by the signature; may come from {@code context.write}
 */
private void writePostBody(LongWritable key, Text value, Context context)
        throws SAXException, IOException, XPathExpressionException, InterruptedException {
    // TODO Where are the counters used? Maybe for some statistics?
    // Are counters global and atomic across all mappers?
    // Where are they output?
    context.getCounter(StackOverflowPostXMLMapper.Counter.TITLES).increment(1);

    final StringReader xmlReader = new StringReader(value.toString());
    final Document doc = documentBuilder.parse(new InputSource(xmlReader));

    // Pull the title out of the XML post via XPath; posts without one are skipped.
    final String title = (String) postTitleXPath.evaluate(doc, XPathConstants.STRING);
    final boolean hasTitle = title != null && !title.equals("");
    if (!hasTitle) {
        context.getCounter(Counter.MISSING_TITLES).increment(1);
        return;
    }

    final String postHtml = (String) postBodyXPath.evaluate(doc, XPathConstants.STRING);
    final String content = parser.parsePostContent(postHtml);

    // TODO Why not use the stackexchange post Id attribute?
    postKey.set((int) key.get());

    postWritable.setTitle(title);
    postWritable.setContent(content);

    // Only questions are emitted, not answers.
    // TODO As an improvement, question and answers could be combined into a single document
    // for better clustering.
    if (!isQuestion(doc)) {
        return;
    }
    context.getCounter(Counter.QUESTIONS).increment(1);
    context.write(postKey, postWritable);
}

From source file:be.uantwerpen.adrem.bigfim.AprioriPhaseMapper.java

License:Apache License

/**
 * Converts one input line into a list of items and passes it to {@code incrementSubSets}.
 *
 * @param key     not used by this method
 * @param value   input line
 * @param context not used by this method
 * @throws IOException          declared by the signature
 * @throws InterruptedException declared by the signature
 */
@Override
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    final String transaction = value.toString();
    final boolean firstPhase = phase == 1;
    incrementSubSets(convertLineToSet(transaction, firstPhase, singletons, delimiter));
}

From source file:be.uantwerpen.adrem.bigfim.AprioriPhaseReducer.java

License:Apache License

/**
 * Aggregates the supports reported for {@code key}, removes the low ones, and writes out what
 * remains.
 *
 * <p>Nothing is written when no supports survive {@code removeLowSupports}. Otherwise
 * {@code writeShortFis} runs first; the trie group and PG info are only handled if the support
 * map is still non-empty after that call.
 *
 * @param key     its string form is used as the prefix
 * @param values  values from which the supports are collected via {@code getSupports}
 * @param context not used directly by this method
 * @throws IOException          declared by the signature
 * @throws InterruptedException declared by the signature
 */
@Override
public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
    final Map<String, MutableInt> supportByExtension = getSupports(values);
    removeLowSupports(supportByExtension);

    if (!supportByExtension.isEmpty()) {
        final String prefix = key.toString();

        writeShortFis(prefix, supportByExtension);
        // The map is re-checked after writeShortFis before the remaining steps run.
        if (supportByExtension.isEmpty()) {
            return;
        }
        writeTrieGroup(prefix, supportByExtension);
        updatePGInfo(prefix, supportByExtension);
    }
}

From source file:be.uantwerpen.adrem.bigfim.AprioriPhaseReducer.java

License:Apache License

/**
 * Sums the supports carried by {@code values}, grouped by their first field.
 *
 * <p>Each value's string form is split on single spaces; the first field selects the map entry
 * (obtained through {@code getFromMap}) and the second field, parsed as an int, is added to it.
 *
 * @param values values whose string form has at least two space-separated fields
 * @return map from first field to accumulated support
 */
private Map<String, MutableInt> getSupports(Iterable<Text> values) {
    final Map<String, MutableInt> totals = newHashMap();
    for (Text entry : values) {
        final String[] fields = entry.toString().split(" ");
        final String extension = fields[0];
        getFromMap(totals, extension).add(parseInt(fields[1]));
    }
    return totals;
}

From source file:be.uantwerpen.adrem.bigfim.ComputeTidListMapper.java

License:Apache License

/**
 * Converts one input line into a list of items, reports it through {@code reportItemTids}, and
 * increments {@code counter}.
 *
 * @param key     not used by this method
 * @param value   input line
 * @param context passed on to {@code reportItemTids}
 * @throws IOException          declared by the signature
 * @throws InterruptedException declared by the signature
 */
@Override
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    final String transaction = value.toString();
    final boolean firstPhase = phase == 1;
    reportItemTids(context, convertLineToSet(transaction, firstPhase, singletons, delimiter));
    counter++;
}

From source file:be.uantwerpen.adrem.bigfim.ComputeTidListReducer.java

License:Apache License

/**
 * Chooses a bucket for {@code key}, records {@code totalTids} against it, and writes the key
 * followed by every entry of {@code map} to that bucket's output path.
 *
 * <p>The bucket from {@code getLowestBucket} is used when {@code checkLowestBucket} accepts it;
 * otherwise a new bucket is appended to {@code bucketSizes}. The output path is
 * {@code basePGDir + "/bucket-" + (bucket index + pgStartIndex)}.
 *
 * @param key       its string form is written first, paired with {@code EmptyImw}
 * @param map       entries written after the key, one record each
 * @param totalTids amount added to the chosen bucket's size
 * @throws IOException          declared by the signature; may come from the writes
 * @throws InterruptedException declared by the signature; may come from the writes
 */
private void assignToBucket(Text key, Map<Integer, IntArrayWritable[]> map, int totalTids)
        throws IOException, InterruptedException {
    int localBucket = getLowestBucket();
    final boolean accepted = checkLowestBucket(localBucket, totalTids);
    if (!accepted) {
        // Open a fresh bucket at the end of the list and use it instead.
        bucketSizes.add(new MutableInt());
        localBucket = bucketSizes.size() - 1;
    }
    bucketSizes.get(localBucket).add(totalTids);

    // The path uses the bucket index shifted by pgStartIndex.
    int globalBucket = localBucket;
    globalBucket += pgStartIndex;
    final String bucketPath = basePGDir + "/bucket-" + globalBucket;

    mos.write(IntArrayWritable.of(key.toString()), EmptyImw, bucketPath);
    for (Entry<Integer, IntArrayWritable[]> tidsByItem : map.entrySet()) {
        mos.write(IntArrayWritable.of(tidsByItem.getKey()), new IntMatrixWritable(tidsByItem.getValue()),
                bucketPath);
    }
    // Closing (EmptyIaw, EmptyImw) record -- presumably an end-of-group marker; confirm with the reader side.
    mos.write(EmptyIaw, EmptyImw, bucketPath);
}

From source file:be.uantwerpen.adrem.disteclat.PrefixComputerMapper.java

License:Apache License

/**
 * Mines each item listed in the second tab-separated field of the input line.
 *
 * <p>An {@code EclatMiner} is configured with a {@code PrefixItemTIDsReporter} and a maximum
 * size of {@code prefixLength}. Each space-separated item is looked up in {@code orderMap} to
 * find its position in {@code singletons}; it is then mined with all singletons after that
 * position as extensions.
 *
 * @param key     not used by this method
 * @param value   input line with at least two tab-separated fields
 * @param context passed to the reporter
 * @throws IOException          declared by the signature
 * @throws InterruptedException declared by the signature
 */
@Override
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    final String items = value.toString().split("\t")[1];

    // If the prefix length is 1, just report the singletons; otherwise use
    // Eclat to find X-FIs seeds.
    final EclatMiner miner = new EclatMiner();
    miner.setSetReporter(new PrefixItemTIDsReporter(context, prefixLength, singletons, orderMap));
    miner.setMaxSize(prefixLength);

    final String[] itemTokens = items.split(" ");
    for (int i = 0; i < itemTokens.length; i++) {
        final String token = itemTokens[i];
        final int position = orderMap.get(Integer.valueOf(token));
        final Item item = singletons.get(position);
        assert item.id == parseInt(token);
        // Only singletons that come after this item are candidate extensions.
        miner.mineRecByPruning(item, singletons.subList(position + 1, singletons.size()), minSup);
    }
}

From source file:be.uantwerpen.adrem.disteclat.PrefixComputerReducer.java

License:Apache License

/**
 * Chooses a bucket for {@code key}, records {@code totalTids} against it, and writes the key
 * followed by every entry of {@code map} to that bucket's output path.
 *
 * <p>The bucket from {@code getLowestBucket} is used when {@code checkLowestBucket} accepts it;
 * otherwise a new bucket is appended to {@code bucketSizes}. The output path is
 * {@code "bucket-" + bucket index}.
 *
 * @param key       its string form is written first, paired with {@code EmptyImw}
 * @param map       entries written after the key, one record each
 * @param totalTids amount added to the chosen bucket's size
 * @throws IOException          declared by the signature; may come from the writes
 * @throws InterruptedException declared by the signature; may come from the writes
 */
private void assignToBucket(Text key, Map<Integer, IntArrayWritable[]> map, int totalTids)
        throws IOException, InterruptedException {
    int bucket = getLowestBucket();
    final boolean accepted = checkLowestBucket(bucket, totalTids);
    if (!accepted) {
        // Open a fresh bucket at the end of the list and use it instead.
        bucketSizes.add(new MutableInt());
        bucket = bucketSizes.size() - 1;
    }
    bucketSizes.get(bucket).add(totalTids);

    final String bucketPath = "bucket-" + bucket;

    mos.write(IntArrayWritable.of(key.toString()), EmptyImw, bucketPath);
    for (Entry<Integer, IntArrayWritable[]> tidsByItem : map.entrySet()) {
        mos.write(IntArrayWritable.of(tidsByItem.getKey()), new IntMatrixWritable(tidsByItem.getValue()),
                bucketPath);
    }
    // Closing (EmptyIaw, EmptyImw) record -- presumably an end-of-group marker; confirm with the reader side.
    mos.write(EmptyIaw, EmptyImw, bucketPath);
}