Example usage for org.apache.hadoop.mapred Reporter getInputSplit

Introduction

This page collects example usages of the `getInputSplit()` method of `org.apache.hadoop.mapred.Reporter`.

Prototype

public abstract InputSplit getInputSplit() throws UnsupportedOperationException;

Source Link

Document

Get the InputSplit object for a map.

Usage

From source file:br.ufrj.nce.recureco.distributedindex.indexer.IndexerMap.java

License:Open Source License

/**
 * Emits one {@code (token, file name)} pair for every token of the input line.
 *
 * <p>The file name comes from the current input split, which is cast to
 * {@code FileSplit}; a split of any other type makes that cast fail.
 *
 * @param key      input key; not read by this method
 * @param value    line of text handed to {@code lineTokenizer}
 * @param output   collector that receives the emitted pairs
 * @param reporter used only to look up the input split of this map task
 * @throws IOException if the collector rejects a pair
 */
public void map(LongWritable key, Text value, OutputCollector<Text, Text> output, Reporter reporter)
        throws IOException {
    // Resolve the file name first; it is the same for every token of this line.
    final FileSplit currentSplit = (FileSplit) reporter.getInputSplit();
    final String sourceName = currentSplit.getPath().getName();

    final List<String> tokens = lineTokenizer.tokenize(value.toString());
    for (final String token : tokens) {
        final Text tokenText = new Text(token);
        final Text sourceText = new Text(sourceName);
        output.collect(tokenText, sourceText);
    }
}

From source file:edu.uci.ics.fuzzyjoin.hadoop.recordpairs.MapBasicJoin.java

License:Apache License

/**
 * Routes an input line to one of two output shapes, depending on whether it contains
 * {@code FuzzyJoinConfig.RECORD_SEPARATOR}.
 *
 * <ul>
 *   <li>Line with the record separator (a record, "RID:Record"): emitted once, unchanged,
 *       under the key {@code (relation, RID, 0)}. The relation is 1 when the string form
 *       of the input split contains {@code suffixSecond}, otherwise 0
 *       (0: Relation R, 1: Relation S).</li>
 *   <li>Line without it (a RID pair, "RID-R RID-S Similarity"): emitted twice, once under
 *       {@code (0, RID-R, 1)} with value "RID-S Similarity" and once under
 *       {@code (1, RID-S, 1)} with value "RID-R Similarity".</li>
 * </ul>
 *
 * @param unused   input key; not read
 * @param record   input line
 * @param output   collector that receives the emitted pairs
 * @param reporter used only to look up the input split of this map task
 * @throws IOException if the collector rejects a pair
 */
public void map(Object unused, Text record, OutputCollector<IntTripleWritable, Text> output, Reporter reporter)
        throws IOException {
    final String line = record.toString();

    if (record.find("" + FuzzyJoinConfig.RECORD_SEPARATOR) < 0) {
        /*
         * VALUE1: "RID-R RID-S Similarity"
         *
         * KEY2: 0/1 (0: Relation R, 1: Relation S), RID, 1
         *
         * VALUE2: "RIDOther Similarity"
         */
        final String[] pair = line.split(FuzzyJoinConfig.RIDPAIRS_SEPARATOR_REGEX);

        // Copy for relation R: keyed by the first RID, carrying the second one.
        outputKey.set(0, Integer.parseInt(pair[0]), 1);
        outputValue.set(pair[1] + FuzzyJoinConfig.RIDPAIRS_SEPARATOR + pair[2]);
        output.collect(outputKey, outputValue);

        // Copy for relation S: keyed by the second RID, carrying the first one.
        outputKey.set(1, Integer.parseInt(pair[1]), 1);
        outputValue.set(pair[0] + FuzzyJoinConfig.RIDPAIRS_SEPARATOR + pair[2]);
        output.collect(outputKey, outputValue);
        return;
    }

    /*
     * VALUE1: RID:Record
     *
     * KEY2: 0/1 (0: Relation R, 1: Relation S), RID, 0
     *
     * VALUE2: Record
     */
    final String[] fields = line.split(FuzzyJoinConfig.RECORD_SEPARATOR_REGEX);

    final boolean fromSecond = reporter.getInputSplit().toString().contains(suffixSecond);
    final int relation = fromSecond ? 1 : 0;

    outputKey.set(relation, Integer.valueOf(fields[FuzzyJoinConfig.RECORD_KEY]), 0);
    outputValue.set(record);
    output.collect(outputKey, outputValue);
}

From source file:edu.uci.ics.fuzzyjoin.hadoop.recordpairs.MapBroadcastJoin.java

License:Apache License

/**
 * Looks up the RID of the incoming record in the join index of its relation and emits
 * the record once per matching RID pair, prefixed with that pair's similarity.
 *
 * <p>The relation is 1 when the string form of the input split contains
 * {@code suffixSecond}, otherwise 0. Relation 0 is looked up in {@code joinIndexR},
 * relation 1 in {@code joinIndexS}. Records whose RID has no index entry emit nothing.
 *
 * <p>Note that {@code record} is overwritten in place for every emitted pair.
 *
 * @param unused   input key; not read
 * @param record   input record; reused as the output value
 * @param output   collector that receives the emitted pairs
 * @param reporter used only to look up the input split of this map task
 * @throws IOException if the collector rejects a pair
 */
public void map(Object unused, Text record, OutputCollector<IntTripleWritable, Text> output, Reporter reporter)
        throws IOException {
    final int relation = reporter.getInputSplit().toString().contains(suffixSecond) ? 1 : 0;

    final String[] fields = record.toString().split(FuzzyJoinConfig.RECORD_SEPARATOR_REGEX);
    final int rid = Integer.parseInt(fields[FuzzyJoinConfig.RECORD_KEY]);
    // Snapshot the text now: record is overwritten inside the loop below.
    final String recordStr = record.toString();

    final ArrayList<RIDPairSimilarity> partners = (relation == 0) ? joinIndexR.get(rid) : joinIndexS.get(rid);
    if (partners == null) {
        return;
    }

    for (RIDPairSimilarity partner : partners) {
        key.set(partner.rid1, partner.rid2, relation);
        record.set("" + partner.similarity + FuzzyJoinConfig.RECORD_EXTRA_SEPARATOR + recordStr);
        output.collect(key, record);
    }
}

From source file:edu.uci.ics.fuzzyjoin.hadoop.ridpairs.ppjoin.MapJoin.java

License:Apache License

/**
 * Tokenizes one record, ranks its tokens, derives the set of groups the record belongs
 * to, and emits the record's join value once per group.
 *
 * <p>The relation is 1 when the string form of the input split contains
 * {@code suffixSecond}, otherwise 0. Groups come from {@code recordGroup}: either a
 * single lookup by length when {@code recordGroup.isLengthOnly()} is true, or one lookup
 * per token among the first {@code prefixLength} ranked tokens otherwise.
 *
 * <p>The emitted key is {@code (group, lengthKey, relation)}. For relation 1 both the
 * key length and the value length are the number of unranked tokens; for relation 0 the
 * key length is {@code similarityFilters.getLengthLowerBound(...)} of the ranked-token
 * count and the value length is the ranked-token count.
 *
 * @param unused     input key; not read
 * @param inputValue input record
 * @param output     collector that receives the emitted pairs
 * @param reporter   used only to look up the input split of this map task
 * @throws IOException if the collector rejects a pair
 */
public void map(Object unused, Text inputValue, OutputCollector<IntTripleWritable, ValueJoin> output,
        Reporter reporter) throws IOException {
    // ---- RID and tokens ----
    final String[] fields = inputValue.toString().split(FuzzyJoinConfig.RECORD_SEPARATOR_REGEX);

    final int relation = reporter.getInputSplit().toString().contains(suffixSecond) ? 1 : 0;
    final int rid = Integer.parseInt(fields[FuzzyJoinConfig.RECORD_KEY]);

    final String data = FuzzyJoinUtil.getData(fields, dataColumns[relation], FuzzyJoinConfig.TOKEN_SEPARATOR);
    final List<String> tokensUnranked = tokenizer.tokenize(data);
    final Collection<Integer> tokensRanked = tokenRank.getTokenRanks(tokensUnranked);

    // ---- token groups ----
    int length = tokensRanked.size();
    final HashSet<Integer> groups = new HashSet<Integer>();
    final int prefixLength = similarityFilters.getPrefixLength(length);

    if (recordGroup.isLengthOnly()) {
        for (Integer group : recordGroup.getGroups(0, length)) {
            groups.add(group);
        }
    }

    int seen = 0;
    for (Integer token : tokensRanked) {
        if (recordGroup.isLengthOnly()) {
            continue;
        }
        // Only tokens inside the prefix contribute groups.
        if (seen < prefixLength) {
            for (Integer group : recordGroup.getGroups(token, length)) {
                groups.add(group);
            }
        }
        seen++;
    }

    // ---- lengths used in key and value ----
    int lengthKey = similarityFilters.getLengthLowerBound(length);
    if (relation == 1) {
        final int unrankedCount = tokensUnranked.size();
        lengthKey = unrankedCount;
        length = unrankedCount;
    }

    // ---- key parts shared by every group ----
    outputKey.setSecond(lengthKey);
    outputKey.setThird(relation);

    // ---- value ----
    outputValue.setRelation(relation);
    outputValue.setRID(rid);
    outputValue.setLength(length);
    outputValue.setTokens(tokensRanked);

    // ---- one output pair per group ----
    for (Integer group : groups) {
        outputKey.setFirst(group);
        output.collect(outputKey, outputValue);
    }
}

From source file:edu.uci.ics.fuzzyjoin.hadoop.ridpairs.token.MapJoin.java

License:Apache License

/**
 * Tokenizes one record, ranks its tokens, and emits the record's join value once for
 * each of the first {@code prefixLength} ranked tokens.
 *
 * <p>The relation is 1 when the string form of the input split contains
 * {@code suffixSecond}, otherwise 0. The emitted key is {@code (token, relation)}.
 * The value carries the relation, the RID, the ranked tokens and the number of
 * unranked tokens.
 *
 * <p>The record is split once and the resulting fields are used both for the RID and
 * for the data columns; the input text does not change between the two uses, so a
 * second split would only repeat the same work.
 *
 * @param unused     input key; not read
 * @param inputValue input record
 * @param output     collector that receives the emitted pairs
 * @param reporter   used only to look up the input split of this map task
 * @throws IOException if the collector rejects a pair
 */
public void map(Object unused, Text inputValue, OutputCollector<IntPairWritable, ValueJoin> output,
        Reporter reporter) throws IOException {
    //
    // get RID and Tokens
    //
    String[] splits = inputValue.toString().split(FuzzyJoinConfig.RECORD_SEPARATOR_REGEX);

    int relation = 0;
    if (reporter.getInputSplit().toString().contains(suffixSecond)) {
        relation = 1;
    }
    int rid = Integer.parseInt(splits[FuzzyJoinConfig.RECORD_KEY]);
    List<String> tokens = tokenizer
            .tokenize(FuzzyJoinUtil.getData(splits, dataColumns[relation], FuzzyJoinConfig.TOKEN_SEPARATOR));
    Collection<Integer> tokensRanked = tokenRank.getTokenRanks(tokens);
    int prefixLength = similarityFilters.getPrefixLength(tokensRanked.size());

    outputKey.setSecond(relation);
    outputValue.setRelation(relation);
    outputValue.setRID(rid);
    outputValue.setTokens(tokensRanked);
    outputValue.setLength(tokens.size());

    //
    // collect one pair per prefix token
    //
    int position = 0;
    for (Integer token : tokensRanked) {
        if (position >= prefixLength) {
            break;
        }
        outputKey.setFirst(token);
        output.collect(outputKey, outputValue);
        position++;
    }
}

From source file:edu.uci.ics.fuzzyjoin.hadoop.ridrecordpairs.ppjoin.MapJoin.java

License:Apache License

/**
 * Tokenizes one record, ranks its tokens, derives the set of groups the record belongs
 * to, and emits the record's join value (including the record itself) once per group.
 *
 * <p>The relation is 1 when the string form of the input split contains
 * {@code suffixSecond}, otherwise 0. Groups come from {@code recordGroup}: either a
 * single lookup by length when {@code recordGroup.isLengthOnly()} is true, or one lookup
 * per token among the first {@code prefixLength} ranked tokens otherwise.
 *
 * <p>The emitted key is {@code (group, lengthKey, relation)}. For relation 1 both the
 * key length and the value length are the number of unranked tokens; for relation 0 the
 * key length is {@code similarityFilters.getLengthLowerBound(...)} of the ranked-token
 * count and the value length is the ranked-token count.
 *
 * <p>The record is split once and the resulting fields are used both for the RID and
 * for the data columns; the input text does not change between the two uses, so a
 * second split would only repeat the same work.
 *
 * @param unused     input key; not read
 * @param inputValue input record; also stored in the output value
 * @param output     collector that receives the emitted pairs
 * @param reporter   used only to look up the input split of this map task
 * @throws IOException if the collector rejects a pair
 */
public void map(Object unused, Text inputValue, OutputCollector<IntTripleWritable, ValueJoin> output,
        Reporter reporter) throws IOException {
    //
    // get RID and Tokens
    //
    String[] splits = inputValue.toString().split(FuzzyJoinConfig.RECORD_SEPARATOR_REGEX);

    int relation = 0;
    if (reporter.getInputSplit().toString().contains(suffixSecond)) {
        relation = 1;
    }
    int rid = Integer.parseInt(splits[FuzzyJoinConfig.RECORD_KEY]);
    List<String> tokensUnranked = tokenizer
            .tokenize(FuzzyJoinUtil.getData(splits, dataColumns[relation], FuzzyJoinConfig.TOKEN_SEPARATOR));
    Collection<Integer> tokensRanked = tokenRank.getTokenRanks(tokensUnranked);
    //
    // get token groups
    //
    int length = tokensRanked.size();
    HashSet<Integer> tokenGroups = new HashSet<Integer>();
    int prefixLength = similarityFilters.getPrefixLength(length);
    int position = 0;
    if (recordGroup.isLengthOnly()) {
        for (Integer group : recordGroup.getGroups(0, length)) {
            tokenGroups.add(group);
        }
    }
    for (Integer token : tokensRanked) {
        if (!recordGroup.isLengthOnly()) {
            // Only tokens inside the prefix contribute groups.
            if (position < prefixLength) {
                for (Integer group : recordGroup.getGroups(token, length)) {
                    tokenGroups.add(group);
                }
            }
            position++;
        }
    }
    //
    // set key length
    //
    int lengthKey = similarityFilters.getLengthLowerBound(length);
    if (relation == 1) {
        length = lengthKey = tokensUnranked.size();
    }

    //
    // Key
    //
    outputKey.setSecond(lengthKey);
    outputKey.setThird(relation);
    //
    // Value
    //
    outputValue.setRelation(relation);
    outputValue.setRID(rid);
    outputValue.setLength(length);
    outputValue.setTokens(tokensRanked);
    outputValue.setRecord(inputValue);
    //
    // output one pair per group
    //
    for (Integer group : tokenGroups) {
        outputKey.setFirst(group);
        output.collect(outputKey, outputValue);
    }
}

From source file:edu.uci.ics.fuzzyjoin.hadoop.ridrecordpairs.token.MapJoin.java

License:Apache License

/**
 * Tokenizes one record, ranks its tokens, and emits the record's join value (including
 * the record itself) once for each of the first {@code prefixLength} ranked tokens.
 *
 * <p>The relation is 1 when the string form of the input split contains
 * {@code suffixSecond}, otherwise 0. The emitted key is {@code (token, relation)}.
 * The value carries the relation, the RID, the ranked tokens, the number of unranked
 * tokens and the input record.
 *
 * <p>The record is split once and the resulting fields are used both for the RID and
 * for the data columns; the input text does not change between the two uses, so a
 * second split would only repeat the same work.
 *
 * @param unused     input key; not read
 * @param inputValue input record; also stored in the output value
 * @param output     collector that receives the emitted pairs
 * @param reporter   used only to look up the input split of this map task
 * @throws IOException if the collector rejects a pair
 */
public void map(Object unused, Text inputValue, OutputCollector<IntPairWritable, ValueJoin> output,
        Reporter reporter) throws IOException {
    //
    // get RID and Tokens
    //
    String[] splits = inputValue.toString().split(FuzzyJoinConfig.RECORD_SEPARATOR_REGEX);

    int relation = 0;
    if (reporter.getInputSplit().toString().contains(suffixSecond)) {
        relation = 1;
    }
    int rid = Integer.parseInt(splits[FuzzyJoinConfig.RECORD_KEY]);
    List<String> tokens = tokenizer
            .tokenize(FuzzyJoinUtil.getData(splits, dataColumns[relation], FuzzyJoinConfig.TOKEN_SEPARATOR));
    Collection<Integer> tokensRanked = tokenRank.getTokenRanks(tokens);
    int prefixLength = similarityFilters.getPrefixLength(tokensRanked.size());

    outputKey.setSecond(relation);
    outputValue.setRelation(relation);
    outputValue.setRID(rid);
    outputValue.setTokens(tokensRanked);
    outputValue.setLength(tokens.size());
    outputValue.setRecord(inputValue);

    //
    // collect one pair per prefix token
    //
    int position = 0;
    for (Integer token : tokensRanked) {
        if (position >= prefixLength) {
            break;
        }
        outputKey.setFirst(token);
        output.collect(outputKey, outputValue);
        position++;
    }
}

From source file:fileSearch.fileSearchMapper.java

/**
 * Emits {@code (file name, one)} for every token of the input line that contains
 * {@code fileSearchJob.SearchPhrase}.
 *
 * <p>Tokens are produced by a {@link StringTokenizer} with its default delimiters. The
 * file name is taken from the current input split, which is cast to {@code FileSplit}.
 *
 * <p>{@code Text.find} returns the byte position of the first occurrence, or -1 when
 * the phrase is absent. A token that <em>starts</em> with the phrase therefore yields
 * position 0 and must count as a match, so the test is {@code >= 0}.
 *
 * @param key      input key; not read
 * @param value    line of text to search
 * @param output   collector that receives one pair per matching token
 * @param reporter used only to look up the input split of this map task
 * @throws IOException if the collector rejects a pair
 */
public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter)
        throws IOException {
    String line = value.toString();
    StringTokenizer tokenizer = new StringTokenizer(line);
    FileSplit split = (FileSplit) reporter.getInputSplit();
    while (tokenizer.hasMoreTokens()) {
        word.set(tokenizer.nextToken());
        // -1 means "not found"; position 0 is a valid match at the start of the token.
        if (word.find(fileSearchJob.SearchPhrase) >= 0) {
            String location = split.getPath().getName();

            output.collect(new Text(location), one);
        }
    }
}

From source file:hadoopProcesses.map.java

/**
 * Emits {@code (file name, one)} once for every token of the input line.
 *
 * <p>Tokens are produced by a {@link StringTokenizer} with its default delimiters. Each
 * token is stored in {@code word}, but only the file name is emitted as the key. The
 * file name is taken from the current input split, which is cast to {@code FileSplit}.
 *
 * @param key      input key; not read
 * @param value    line of text to tokenize
 * @param output   collector that receives one pair per token
 * @param reporter used only to look up the input split of this map task
 * @throws IOException if the collector rejects a pair
 */
public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter)
        throws IOException {
    final StringTokenizer tokens = new StringTokenizer(value.toString());

    // The file name is fixed for the whole split, so resolve it before the loop.
    final FileSplit currentSplit = (FileSplit) reporter.getInputSplit();
    final String fileName = currentSplit.getPath().getName();

    for (; tokens.hasMoreTokens();) {
        word.set(tokens.nextToken());
        final Text fileKey = new Text(fileName);
        output.collect(fileKey, one);
    }
}

From source file:invertedIndex.lineIndexMapper.java

/**
 * Emits {@code (token, file name)} for every token of the lower-cased input line.
 *
 * <p>The file name is taken from the current input split, which is cast to
 * {@code FileSplit}, and stored in {@code location}. Each token is stored in
 * {@code word} before being emitted. Tokens are produced by a {@link StringTokenizer}
 * with its default delimiters.
 *
 * @param key      input key; not read
 * @param val      line of text to tokenize
 * @param output   collector that receives one pair per token
 * @param reporter used only to look up the input split of this map task
 * @throws IOException if the collector rejects a pair
 */
public void map(LongWritable key, Text val, OutputCollector<Text, Text> output, Reporter reporter)
        throws IOException {
    // Record which file this line came from; the same value is emitted with every token.
    final FileSplit currentSplit = (FileSplit) reporter.getInputSplit();
    location.set(currentSplit.getPath().getName());

    final String lowered = val.toString().toLowerCase();
    final StringTokenizer tokens = new StringTokenizer(lowered);
    for (; tokens.hasMoreTokens();) {
        word.set(tokens.nextToken());
        output.collect(word, location);
    }
}