Example usage for org.apache.hadoop.mapred Reporter getInputSplit

List of usage examples for org.apache.hadoop.mapred Reporter getInputSplit

Introduction

This page collects example usages of the getInputSplit method of org.apache.hadoop.mapred.Reporter.

Prototype

public abstract InputSplit getInputSplit() throws UnsupportedOperationException;

Source Link

Document

Get the InputSplit object for a map.

Usage

From source file:br.ufrj.nce.recureco.distributedindex.indexer.IndexerMap.java

License:Open Source License

/**
 * Emits one {@code (word, file name)} pair for every word that {@code lineTokenizer} returns
 * for the input line.
 *
 * <p>The file name is the last path component of the {@link FileSplit} currently being processed.
 *
 * @param key not read by this method
 * @param value the line to tokenize
 * @param output collector that receives the {@code (word, file name)} pairs
 * @param reporter used only to obtain the current input split
 * @throws IOException propagated from {@code output.collect}
 * @throws ClassCastException if the current input split is not a {@link FileSplit}
 */
public void map(LongWritable key, Text value, OutputCollector<Text, Text> output, Reporter reporter)
        throws IOException {

    FileSplit fileSplit = (FileSplit) reporter.getInputSplit();
    // The file name does not change within one call, so the output value is built once
    // instead of once per word; it is never mutated afterwards.
    Text fileName = new Text(fileSplit.getPath().getName());

    for (String word : lineTokenizer.tokenize(value.toString())) {
        output.collect(new Text(word), fileName);
    }
}

From source file:edu.uci.ics.fuzzyjoin.hadoop.recordpairs.MapBasicJoin.java

License:Apache License

/**
 * Routes an input line to one of two output layouts, depending on whether the line contains
 * {@code FuzzyJoinConfig.RECORD_SEPARATOR}.
 *
 * <p>Lines containing the separator are treated as records and emitted once; all other lines
 * are treated as RID pairs and emitted twice, once per side of the pair.
 *
 * @param unused not read by this method
 * @param record the input line
 * @param output collector that receives the keyed values
 * @param reporter used only to obtain the current input split
 * @throws IOException propagated from {@code output.collect}
 */
public void map(Object unused, Text record, OutputCollector<IntTripleWritable, Text> output, Reporter reporter)
        throws IOException {
    final String line = record.toString();
    final boolean isRecord = record.find("" + FuzzyJoinConfig.RECORD_SEPARATOR) >= 0;

    if (!isRecord) {
        /*
         * VALUE1: "RID-R RID-S Similarity"
         *
         * KEY2: 0/1 (0: Relation R, 1: Relation S), RID, 1
         *
         * VALUE2: "RIDOther Similarity"
         */
        final String[] pair = line.split(FuzzyJoinConfig.RIDPAIRS_SEPARATOR_REGEX);

        // Keyed by the first RID; the value carries the other RID plus the similarity.
        outputKey.set(0, Integer.parseInt(pair[0]), 1);
        outputValue.set(pair[1] + FuzzyJoinConfig.RIDPAIRS_SEPARATOR + pair[2]);
        output.collect(outputKey, outputValue);

        // Same pair again, keyed by the second RID.
        outputKey.set(1, Integer.parseInt(pair[1]), 1);
        outputValue.set(pair[0] + FuzzyJoinConfig.RIDPAIRS_SEPARATOR + pair[2]);
        output.collect(outputKey, outputValue);
        return;
    }

    /*
     * VALUE1: RID:Record
     *
     * KEY2: 0/1 (0: Relation R, 1: Relation S), RID, 0
     *
     * VALUE2: Record
     */
    final String[] fields = line.split(FuzzyJoinConfig.RECORD_SEPARATOR_REGEX);

    // The relation is 1 when the split's string form contains suffixSecond, otherwise 0.
    final int relation = reporter.getInputSplit().toString().contains(suffixSecond) ? 1 : 0;

    outputKey.set(relation, Integer.valueOf(fields[FuzzyJoinConfig.RECORD_KEY]), 0);
    outputValue.set(record);
    output.collect(outputKey, outputValue);
}

From source file:edu.uci.ics.fuzzyjoin.hadoop.recordpairs.MapBroadcastJoin.java

License:Apache License

/**
 * Looks up the record's RID in the join index of its relation and emits the record once per
 * matching {@code RIDPairSimilarity}, prefixed with that pair's similarity.
 *
 * <p>The relation is 1 when the string form of the current input split contains
 * {@code suffixSecond}, otherwise 0. Relation 0 is looked up in {@code joinIndexR}, relation 1
 * in {@code joinIndexS}. Nothing is emitted when the index has no entry for the RID.
 *
 * <p>Note that the incoming {@code record} object is overwritten and reused as the output value.
 *
 * @param unused not read by this method
 * @param record the input record; its content is replaced for each emitted pair
 * @param output collector that receives the keyed records
 * @param reporter used only to obtain the current input split
 * @throws IOException propagated from {@code output.collect}
 */
public void map(Object unused, Text record, OutputCollector<IntTripleWritable, Text> output, Reporter reporter)
        throws IOException {
    // Capture the original text first: record is overwritten inside the loop below.
    final String recordStr = record.toString();
    final int relation = reporter.getInputSplit().toString().contains(suffixSecond) ? 1 : 0;

    final String[] fields = recordStr.split(FuzzyJoinConfig.RECORD_SEPARATOR_REGEX);
    final int rid = Integer.valueOf(fields[FuzzyJoinConfig.RECORD_KEY]);

    final ArrayList<RIDPairSimilarity> matches = (relation == 0) ? joinIndexR.get(rid) : joinIndexS.get(rid);
    if (matches == null) {
        return;
    }

    for (RIDPairSimilarity match : matches) {
        key.set(match.rid1, match.rid2, relation);
        record.set("" + match.similarity + FuzzyJoinConfig.RECORD_EXTRA_SEPARATOR + recordStr);
        output.collect(key, record);
    }
}

From source file:edu.uci.ics.fuzzyjoin.hadoop.ridpairs.ppjoin.MapJoin.java

License:Apache License

/**
 * Tokenizes and ranks one record, determines the set of groups it belongs to, and emits the
 * record's join value once per group.
 *
 * <p>The relation is 1 when the string form of the current input split contains
 * {@code suffixSecond}, otherwise 0. Groups come from {@code recordGroup}: either from the
 * record length alone, or from each of the first {@code prefixLength} ranked tokens.
 *
 * @param unused not read by this method
 * @param inputValue the input record line
 * @param output collector that receives one {@code (key, value)} pair per group
 * @param reporter used only to obtain the current input split
 * @throws IOException propagated from {@code output.collect}
 */
public void map(Object unused, Text inputValue, OutputCollector<IntTripleWritable, ValueJoin> output,
        Reporter reporter) throws IOException {
    //
    // get RID and Tokens
    //
    String[] splits = inputValue.toString().split(FuzzyJoinConfig.RECORD_SEPARATOR_REGEX);

    int relation = reporter.getInputSplit().toString().contains(suffixSecond) ? 1 : 0;
    int rid = Integer.parseInt(splits[FuzzyJoinConfig.RECORD_KEY]);
    List<String> tokensUnranked = tokenizer
            .tokenize(FuzzyJoinUtil.getData(splits, dataColumns[relation], FuzzyJoinConfig.TOKEN_SEPARATOR));
    Collection<Integer> tokensRanked = tokenRank.getTokenRanks(tokensUnranked);
    //
    // collect the token groups
    //
    int length = tokensRanked.size();
    HashSet<Integer> tokenGroups = new HashSet<Integer>();
    int prefixLength = similarityFilters.getPrefixLength(length);
    // NOTE(review): assumes isLengthOnly() is a side-effect-free getter whose value does not
    // change during this call, so it is evaluated once rather than once per token -- confirm.
    if (recordGroup.isLengthOnly()) {
        for (Integer group : recordGroup.getGroups(0, length)) {
            tokenGroups.add(group);
        }
    } else {
        int position = 0;
        for (Integer token : tokensRanked) {
            // Only the first prefixLength tokens contribute groups; stop scanning after that.
            if (position >= prefixLength) {
                break;
            }
            for (Integer group : recordGroup.getGroups(token, length)) {
                tokenGroups.add(group);
            }
            position++;
        }
    }
    //
    // set key length; for relation 1 both lengths are replaced by the unranked token count
    //
    int lengthKey = similarityFilters.getLengthLowerBound(length);
    if (relation == 1) {
        length = lengthKey = tokensUnranked.size();
    }

    //
    // Key
    //
    outputKey.setSecond(lengthKey);
    outputKey.setThird(relation);
    //
    // Value
    //
    outputValue.setRelation(relation);
    outputValue.setRID(rid);
    outputValue.setLength(length);
    outputValue.setTokens(tokensRanked);
    //
    // output one pair per group
    //
    for (Integer group : tokenGroups) {
        outputKey.setFirst(group);
        output.collect(outputKey, outputValue);
    }
}

From source file:edu.uci.ics.fuzzyjoin.hadoop.ridpairs.token.MapJoin.java

License:Apache License

/**
 * Tokenizes and ranks one record and emits its join value once for each of the first
 * {@code prefixLength} ranked tokens, keyed by {@code (token, relation)}.
 *
 * <p>The relation is 1 when the string form of the current input split contains
 * {@code suffixSecond}, otherwise 0.
 *
 * @param unused not read by this method
 * @param inputValue the input record line
 * @param output collector that receives one {@code (key, value)} pair per prefix token
 * @param reporter used only to obtain the current input split
 * @throws IOException propagated from {@code output.collect}
 */
public void map(Object unused, Text inputValue, OutputCollector<IntPairWritable, ValueJoin> output,
        Reporter reporter) throws IOException {
    //
    // get RID and Tokens
    //
    String[] splits = inputValue.toString().split(FuzzyJoinConfig.RECORD_SEPARATOR_REGEX);

    int relation = reporter.getInputSplit().toString().contains(suffixSecond) ? 1 : 0;
    int rid = Integer.parseInt(splits[FuzzyJoinConfig.RECORD_KEY]);
    // Reuse the fields split above instead of decoding and splitting the line a second time.
    List<String> tokens = tokenizer
            .tokenize(FuzzyJoinUtil.getData(splits, dataColumns[relation], FuzzyJoinConfig.TOKEN_SEPARATOR));
    Collection<Integer> tokensRanked = tokenRank.getTokenRanks(tokens);
    int prefixLength = similarityFilters.getPrefixLength(tokensRanked.size());

    outputKey.setSecond(relation);
    outputValue.setRelation(relation);
    outputValue.setRID(rid);
    outputValue.setTokens(tokensRanked);
    outputValue.setLength(tokens.size());

    //
    // collect one pair per prefix token
    //
    int position = 0;
    for (Integer token : tokensRanked) {
        if (position >= prefixLength) {
            break;
        }
        outputKey.setFirst(token);
        output.collect(outputKey, outputValue);
        position++;
    }
}

From source file:edu.uci.ics.fuzzyjoin.hadoop.ridrecordpairs.ppjoin.MapJoin.java

License:Apache License

/**
 * Tokenizes and ranks one record, determines the set of groups it belongs to, and emits the
 * record's join value (including the full input record) once per group.
 *
 * <p>The relation is 1 when the string form of the current input split contains
 * {@code suffixSecond}, otherwise 0. Groups come from {@code recordGroup}: either from the
 * record length alone, or from each of the first {@code prefixLength} ranked tokens.
 *
 * @param unused not read by this method
 * @param inputValue the input record line; also stored in the emitted value
 * @param output collector that receives one {@code (key, value)} pair per group
 * @param reporter used only to obtain the current input split
 * @throws IOException propagated from {@code output.collect}
 */
public void map(Object unused, Text inputValue, OutputCollector<IntTripleWritable, ValueJoin> output,
        Reporter reporter) throws IOException {
    //
    // get RID and Tokens
    //
    String[] splits = inputValue.toString().split(FuzzyJoinConfig.RECORD_SEPARATOR_REGEX);

    int relation = reporter.getInputSplit().toString().contains(suffixSecond) ? 1 : 0;
    int rid = Integer.parseInt(splits[FuzzyJoinConfig.RECORD_KEY]);
    // Reuse the fields split above instead of decoding and splitting the line a second time.
    List<String> tokensUnranked = tokenizer
            .tokenize(FuzzyJoinUtil.getData(splits, dataColumns[relation], FuzzyJoinConfig.TOKEN_SEPARATOR));
    Collection<Integer> tokensRanked = tokenRank.getTokenRanks(tokensUnranked);
    //
    // collect the token groups
    //
    int length = tokensRanked.size();
    HashSet<Integer> tokenGroups = new HashSet<Integer>();
    int prefixLength = similarityFilters.getPrefixLength(length);
    // NOTE(review): assumes isLengthOnly() is a side-effect-free getter whose value does not
    // change during this call, so it is evaluated once rather than once per token -- confirm.
    if (recordGroup.isLengthOnly()) {
        for (Integer group : recordGroup.getGroups(0, length)) {
            tokenGroups.add(group);
        }
    } else {
        int position = 0;
        for (Integer token : tokensRanked) {
            // Only the first prefixLength tokens contribute groups; stop scanning after that.
            if (position >= prefixLength) {
                break;
            }
            for (Integer group : recordGroup.getGroups(token, length)) {
                tokenGroups.add(group);
            }
            position++;
        }
    }
    //
    // set key length; for relation 1 both lengths are replaced by the unranked token count
    //
    int lengthKey = similarityFilters.getLengthLowerBound(length);
    if (relation == 1) {
        length = lengthKey = tokensUnranked.size();
    }

    //
    // Key
    //
    outputKey.setSecond(lengthKey);
    outputKey.setThird(relation);
    //
    // Value
    //
    outputValue.setRelation(relation);
    outputValue.setRID(rid);
    outputValue.setLength(length);
    outputValue.setTokens(tokensRanked);
    outputValue.setRecord(inputValue);
    //
    // output one pair per group
    //
    for (Integer group : tokenGroups) {
        outputKey.setFirst(group);
        output.collect(outputKey, outputValue);
    }
}

From source file:edu.uci.ics.fuzzyjoin.hadoop.ridrecordpairs.token.MapJoin.java

License:Apache License

/**
 * Tokenizes and ranks one record and emits its join value (including the full input record)
 * once for each of the first {@code prefixLength} ranked tokens, keyed by
 * {@code (token, relation)}.
 *
 * <p>The relation is 1 when the string form of the current input split contains
 * {@code suffixSecond}, otherwise 0.
 *
 * @param unused not read by this method
 * @param inputValue the input record line; also stored in the emitted value
 * @param output collector that receives one {@code (key, value)} pair per prefix token
 * @param reporter used only to obtain the current input split
 * @throws IOException propagated from {@code output.collect}
 */
public void map(Object unused, Text inputValue, OutputCollector<IntPairWritable, ValueJoin> output,
        Reporter reporter) throws IOException {
    //
    // get RID and Tokens
    //
    String[] splits = inputValue.toString().split(FuzzyJoinConfig.RECORD_SEPARATOR_REGEX);

    int relation = reporter.getInputSplit().toString().contains(suffixSecond) ? 1 : 0;
    int rid = Integer.parseInt(splits[FuzzyJoinConfig.RECORD_KEY]);
    // Reuse the fields split above instead of decoding and splitting the line a second time.
    List<String> tokens = tokenizer
            .tokenize(FuzzyJoinUtil.getData(splits, dataColumns[relation], FuzzyJoinConfig.TOKEN_SEPARATOR));
    Collection<Integer> tokensRanked = tokenRank.getTokenRanks(tokens);
    int prefixLength = similarityFilters.getPrefixLength(tokensRanked.size());

    outputKey.setSecond(relation);
    outputValue.setRelation(relation);
    outputValue.setRID(rid);
    outputValue.setTokens(tokensRanked);
    outputValue.setLength(tokens.size());
    outputValue.setRecord(inputValue);

    //
    // collect one pair per prefix token
    //
    int position = 0;
    for (Integer token : tokensRanked) {
        if (position >= prefixLength) {
            break;
        }
        outputKey.setFirst(token);
        output.collect(outputKey, outputValue);
        position++;
    }
}

From source file:fileSearch.fileSearchMapper.java

/**
 * Emits {@code (file name, one)} for every whitespace-delimited token of the line that
 * contains {@code fileSearchJob.SearchPhrase}.
 *
 * <p>The file name is the last path component of the {@link FileSplit} currently being processed.
 *
 * @param key not read by this method
 * @param value the line to search
 * @param output collector that receives one pair per matching token
 * @param reporter used only to obtain the current input split
 * @throws IOException propagated from {@code output.collect}
 * @throws ClassCastException if the current input split is not a {@link FileSplit}
 */
public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter)
        throws IOException {
    String line = value.toString();
    StringTokenizer tokenizer = new StringTokenizer(line);
    FileSplit fileSplit = (FileSplit) reporter.getInputSplit();
    // Built on the first match and reused afterwards; it is never mutated once created.
    Text location = null;
    while (tokenizer.hasMoreTokens()) {
        word.set(tokenizer.nextToken());
        // find() reports "not found" as -1, so offset 0 (token starts with the phrase) is a
        // valid match and must be accepted; the previous "> 0" check silently dropped it.
        if (word.find(fileSearchJob.SearchPhrase) >= 0) {
            if (location == null) {
                location = new Text(fileSplit.getPath().getName());
            }
            output.collect(location, one);
        }
    }
}

From source file:hadoopProcesses.map.java

/**
 * Emits {@code (file name, one)} once for every whitespace-delimited token of the line.
 *
 * <p>The file name is the last path component of the {@link FileSplit} currently being processed.
 *
 * @param key not read by this method
 * @param value the line whose tokens are counted
 * @param output collector that receives one pair per token
 * @param reporter used only to obtain the current input split
 * @throws IOException propagated from {@code output.collect}
 * @throws ClassCastException if the current input split is not a {@link FileSplit}
 */
public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter)
        throws IOException {
    String line = value.toString();
    StringTokenizer tokenizer = new StringTokenizer(line);

    FileSplit fileSplit = (FileSplit) reporter.getInputSplit();
    // The key is the same for every token, so it is built once instead of once per token.
    Text location = new Text(fileSplit.getPath().getName());

    while (tokenizer.hasMoreTokens()) {
        // NOTE(review): the token is stored in word but never emitted; only the file name is
        // used as the key -- confirm this is intended.
        word.set(tokenizer.nextToken());
        output.collect(location, one);
    }
}

From source file:invertedIndex.lineIndexMapper.java

/**
 * Emits one {@code (token, file name)} pair for every whitespace-delimited token of the
 * lower-cased input line.
 *
 * <p>The file name is the last path component of the {@link FileSplit} currently being
 * processed. The {@code word} and {@code location} objects are reused as the output key
 * and value.
 *
 * @param key not read by this method
 * @param val the line to index
 * @param output collector that receives the {@code (token, file name)} pairs
 * @param reporter used only to obtain the current input split
 * @throws IOException propagated from {@code output.collect}
 * @throws ClassCastException if the current input split is not a {@link FileSplit}
 */
public void map(LongWritable key, Text val, OutputCollector<Text, Text> output, Reporter reporter)
        throws IOException {

    FileSplit split = (FileSplit) reporter.getInputSplit();
    location.set(split.getPath().getName());

    String lowered = val.toString().toLowerCase();
    for (StringTokenizer tokens = new StringTokenizer(lowered); tokens.hasMoreTokens();) {
        word.set(tokens.nextToken());
        output.collect(word, location);
    }
}