Usage examples for org.apache.hadoop.mapred.Reporter.getInputSplit()
/**
 * Returns the {@link InputSplit} associated with this reporter.
 *
 * <p>The examples in this listing call it from inside {@code map(...)} and either cast the
 * result to {@code FileSplit} to read the file path, or inspect its {@code toString()}.
 *
 * @return the input split; NOTE(review): presumably the split the current map task is
 *         reading - confirm against the Hadoop Javadoc
 * @throws UnsupportedOperationException declared by the signature; NOTE(review): presumably
 *         raised when no split is available to the caller - confirm against the Hadoop Javadoc
 */
public abstract InputSplit getInputSplit() throws UnsupportedOperationException;
From source file:br.ufrj.nce.recureco.distributedindex.indexer.IndexerMap.java
License:Open Source License
public void map(LongWritable key, Text value, OutputCollector<Text, Text> output, Reporter reporter) throws IOException { FileSplit fileSplit = (FileSplit) reporter.getInputSplit(); String filename = fileSplit.getPath().getName(); List<String> tokenizedLine = lineTokenizer.tokenize(value.toString()); for (String auxWord : tokenizedLine) { output.collect(new Text(auxWord), new Text(filename)); }//from w w w . j a v a2 s . c om }
From source file:edu.uci.ics.fuzzyjoin.hadoop.recordpairs.MapBasicJoin.java
License:Apache License
/**
 * Routes an input line to the join key space. Two kinds of lines are handled:
 *
 * <ul>
 *   <li>Record lines (they contain {@code FuzzyJoinConfig.RECORD_SEPARATOR}): emitted once
 *       under the key (relation, RID, 0) with the whole record as value.</li>
 *   <li>RID-pair lines ("RID-R RID-S Similarity"): emitted twice, under (0, RID-R, 1) with
 *       value "RID-S Similarity" and under (1, RID-S, 1) with value "RID-R Similarity".</li>
 * </ul>
 *
 * <p>The relation is 1 when the string form of the input split contains {@code suffixSecond},
 * otherwise 0.
 *
 * @param unused   input key; ignored
 * @param record   the input line
 * @param output   collector receiving the keyed lines
 * @param reporter supplies the input split used to decide the relation
 * @throws IOException if the collector rejects a pair
 */
public void map(Object unused, Text record, OutputCollector<IntTripleWritable, Text> output, Reporter reporter)
        throws IOException {
    final String line = record.toString();
    final boolean isRecordLine = record.find("" + FuzzyJoinConfig.RECORD_SEPARATOR) >= 0;

    if (!isRecordLine) {
        // RID-pair line: fields are [RID-R, RID-S, similarity].
        final String[] pair = line.split(FuzzyJoinConfig.RIDPAIRS_SEPARATOR_REGEX);

        // Keyed by the R-side RID, carrying the S-side RID and the similarity.
        outputKey.set(0, Integer.parseInt(pair[0]), 1);
        outputValue.set(pair[1] + FuzzyJoinConfig.RIDPAIRS_SEPARATOR + pair[2]);
        output.collect(outputKey, outputValue);

        // Keyed by the S-side RID, carrying the R-side RID and the similarity.
        outputKey.set(1, Integer.parseInt(pair[1]), 1);
        outputValue.set(pair[0] + FuzzyJoinConfig.RIDPAIRS_SEPARATOR + pair[2]);
        output.collect(outputKey, outputValue);
        return;
    }

    // Record line: key is (relation, RID, 0), value is the untouched record.
    final String[] fields = line.split(FuzzyJoinConfig.RECORD_SEPARATOR_REGEX);
    final int relation = reporter.getInputSplit().toString().contains(suffixSecond) ? 1 : 0;
    outputKey.set(relation, Integer.valueOf(fields[FuzzyJoinConfig.RECORD_KEY]), 0);
    outputValue.set(record);
    output.collect(outputKey, outputValue);
}
From source file:edu.uci.ics.fuzzyjoin.hadoop.recordpairs.MapBroadcastJoin.java
License:Apache License
public void map(Object unused, Text record, OutputCollector<IntTripleWritable, Text> output, Reporter reporter) throws IOException { int relation = 0; if (reporter.getInputSplit().toString().contains(suffixSecond)) { relation = 1;//from w w w . j av a 2 s . co m } String valueSplit[] = record.toString().split(FuzzyJoinConfig.RECORD_SEPARATOR_REGEX); int rid = Integer.valueOf(valueSplit[FuzzyJoinConfig.RECORD_KEY]); String recordStr = record.toString(); ArrayList<RIDPairSimilarity> set = null; if (relation == 0) { set = joinIndexR.get(rid); } else { set = joinIndexS.get(rid); } if (set != null) { for (RIDPairSimilarity ridSimilarity : set) { key.set(ridSimilarity.rid1, ridSimilarity.rid2, relation); record.set("" + ridSimilarity.similarity + FuzzyJoinConfig.RECORD_EXTRA_SEPARATOR + recordStr); output.collect(key, record); } } }
From source file:edu.uci.ics.fuzzyjoin.hadoop.ridpairs.ppjoin.MapJoin.java
License:Apache License
public void map(Object unused, Text inputValue, OutputCollector<IntTripleWritable, ValueJoin> output, Reporter reporter) throws IOException { ////from w ww. j av a 2 s. co m // get RID and Tokens // String splits[] = inputValue.toString().split(FuzzyJoinConfig.RECORD_SEPARATOR_REGEX); int relation = 0; if (reporter.getInputSplit().toString().contains(suffixSecond)) { relation = 1; } int rid = Integer.valueOf(splits[FuzzyJoinConfig.RECORD_KEY]); List<String> tokensUnranked = tokenizer .tokenize(FuzzyJoinUtil.getData(splits, dataColumns[relation], FuzzyJoinConfig.TOKEN_SEPARATOR)); Collection<Integer> tokensRanked = tokenRank.getTokenRanks(tokensUnranked); // // get Tokens as a DataBag and token groups // int length = tokensRanked.size(); HashSet<Integer> tokenGroups = new HashSet<Integer>(); int prefixLength = similarityFilters.getPrefixLength(length); int position = 0; if (recordGroup.isLengthOnly()) { for (Integer group : recordGroup.getGroups(0, length)) { tokenGroups.add(group); } } for (Integer token : tokensRanked) { if (!recordGroup.isLengthOnly()) { if (position < prefixLength) { for (Integer group : recordGroup.getGroups(token, length)) { tokenGroups.add(group); } } position++; } } // // output one pair per group // int lengthKey = similarityFilters.getLengthLowerBound(length); if (relation == 1) { length = lengthKey = tokensUnranked.size(); } // // Key // outputKey.setSecond(lengthKey); outputKey.setThird(relation); // // Value // outputValue.setRelation(relation); outputValue.setRID(rid); outputValue.setLength(length); outputValue.setTokens(tokensRanked); for (Integer group : tokenGroups) { // // Key // outputKey.setFirst(group); // // Collect // output.collect(outputKey, outputValue); // System.out.println("M:" + outputKey + ":" + outputValue); } }
From source file:edu.uci.ics.fuzzyjoin.hadoop.ridpairs.token.MapJoin.java
License:Apache License
public void map(Object unused, Text inputValue, OutputCollector<IntPairWritable, ValueJoin> output, Reporter reporter) throws IOException { //// w w w . j av a2s. c o m // get RID and Tokens // String splits[] = inputValue.toString().split(FuzzyJoinConfig.RECORD_SEPARATOR_REGEX); int relation = 0; if (reporter.getInputSplit().toString().contains(suffixSecond)) { relation = 1; } int rid = Integer.valueOf(splits[FuzzyJoinConfig.RECORD_KEY]); List<String> tokens = tokenizer .tokenize(FuzzyJoinUtil.getData(inputValue.toString().split(FuzzyJoinConfig.RECORD_SEPARATOR_REGEX), dataColumns[relation], FuzzyJoinConfig.TOKEN_SEPARATOR)); Collection<Integer> tokensRanked = tokenRank.getTokenRanks(tokens); int prefixLength = similarityFilters.getPrefixLength(tokensRanked.size()); int position = 0; outputKey.setSecond(relation); outputValue.setRelation(relation); outputValue.setRID(rid); outputValue.setTokens(tokensRanked); outputValue.setLength(tokens.size()); for (Integer token : tokensRanked) { if (position < prefixLength) { outputKey.setFirst(token); // // collect // output.collect(outputKey, outputValue); } else { break; } position++; } }
From source file:edu.uci.ics.fuzzyjoin.hadoop.ridrecordpairs.ppjoin.MapJoin.java
License:Apache License
public void map(Object unused, Text inputValue, OutputCollector<IntTripleWritable, ValueJoin> output, Reporter reporter) throws IOException { ///* w ww .ja va2 s. c om*/ // get RID and Tokens // String splits[] = inputValue.toString().split(FuzzyJoinConfig.RECORD_SEPARATOR_REGEX); int relation = 0; if (reporter.getInputSplit().toString().contains(suffixSecond)) { relation = 1; } int rid = Integer.valueOf(splits[FuzzyJoinConfig.RECORD_KEY]); List<String> tokensUnranked = tokenizer .tokenize(FuzzyJoinUtil.getData(inputValue.toString().split(FuzzyJoinConfig.RECORD_SEPARATOR_REGEX), dataColumns[relation], FuzzyJoinConfig.TOKEN_SEPARATOR)); Collection<Integer> tokensRanked = tokenRank.getTokenRanks(tokensUnranked); // // get Tokens as a DataBag and token groups // int length = tokensRanked.size(); HashSet<Integer> tokenGroups = new HashSet<Integer>(); int prefixLength = similarityFilters.getPrefixLength(length); int position = 0; if (recordGroup.isLengthOnly()) { for (Integer group : recordGroup.getGroups(0, length)) { tokenGroups.add(group); } } for (Integer token : tokensRanked) { if (!recordGroup.isLengthOnly()) { if (position < prefixLength) { for (Integer group : recordGroup.getGroups(token, length)) { tokenGroups.add(group); } } position++; } } // // set key length // int lengthKey = similarityFilters.getLengthLowerBound(length); if (relation == 1) { length = lengthKey = tokensUnranked.size(); } // // Key // outputKey.setSecond(lengthKey); outputKey.setThird(relation); // // Value // outputValue.setRelation(relation); outputValue.setRID(rid); outputValue.setLength(length); outputValue.setTokens(tokensRanked); outputValue.setRecord(inputValue); // // output one pair per group // for (Integer group : tokenGroups) { // // Key // outputKey.setFirst(group); // // Collect // output.collect(outputKey, outputValue); } }
From source file:edu.uci.ics.fuzzyjoin.hadoop.ridrecordpairs.token.MapJoin.java
License:Apache License
public void map(Object unused, Text inputValue, OutputCollector<IntPairWritable, ValueJoin> output, Reporter reporter) throws IOException { ///* w w w . j av a 2s .com*/ // get RID and Tokens // String splits[] = inputValue.toString().split(FuzzyJoinConfig.RECORD_SEPARATOR_REGEX); int relation = 0; if (reporter.getInputSplit().toString().contains(suffixSecond)) { relation = 1; } int rid = Integer.valueOf(splits[FuzzyJoinConfig.RECORD_KEY]); List<String> tokens = tokenizer .tokenize(FuzzyJoinUtil.getData(inputValue.toString().split(FuzzyJoinConfig.RECORD_SEPARATOR_REGEX), dataColumns[relation], FuzzyJoinConfig.TOKEN_SEPARATOR)); Collection<Integer> tokensRanked = tokenRank.getTokenRanks(tokens); int prefixLength = similarityFilters.getPrefixLength(tokensRanked.size()); int position = 0; outputKey.setSecond(relation); outputValue.setRelation(relation); outputValue.setRID(rid); outputValue.setTokens(tokensRanked); outputValue.setLength(tokens.size()); outputValue.setRecord(inputValue); for (Integer token : tokensRanked) { if (position < prefixLength) { outputKey.setFirst(token); // // collect // output.collect(outputKey, outputValue); } else { break; } position++; } }
From source file:fileSearch.fileSearchMapper.java
/**
 * Emits (file name, {@code one}) for every whitespace-delimited token of the line that
 * contains {@code fileSearchJob.SearchPhrase}.
 *
 * @param key      input key; not used by this method
 * @param value    the line of text to scan
 * @param output   collector receiving one (file name, {@code one}) pair per matching token
 * @param reporter supplies the input split, which is cast to {@code FileSplit} to read the file name
 * @throws IOException if the collector rejects a pair
 */
public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter)
        throws IOException {
    final StringTokenizer tokenizer = new StringTokenizer(value.toString());
    final FileSplit split = (FileSplit) reporter.getInputSplit();
    // The file name is the same for every token of this line, so resolve it once.
    final String location = split.getPath().getName();

    while (tokenizer.hasMoreTokens()) {
        word.set(tokenizer.nextToken());
        // Text.find returns the match offset, or -1 when absent. Offset 0 (the token equals or
        // starts with the phrase) is a valid hit, so the test must be >= 0 rather than > 0.
        if (word.find(fileSearchJob.SearchPhrase) >= 0) {
            output.collect(new Text(location), one);
        }
    }
}
From source file:hadoopProcesses.map.java
public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException { String line = value.toString(); StringTokenizer tokenizer = new StringTokenizer(line); FileSplit Sp = (FileSplit) reporter.getInputSplit(); String Location = Sp.getPath().getName(); while (tokenizer.hasMoreTokens()) { word.set(tokenizer.nextToken()); output.collect(new Text(Location), one); }// w w w .j ava 2 s. c o m }
From source file:invertedIndex.lineIndexMapper.java
public void map(LongWritable key, Text val, OutputCollector<Text, Text> output, Reporter reporter) throws IOException { FileSplit fileSplit = (FileSplit) reporter.getInputSplit(); String fileName = fileSplit.getPath().getName(); location.set(fileName);//from w w w .j a va 2s . c om String line = val.toString(); StringTokenizer itr = new StringTokenizer(line.toLowerCase()); while (itr.hasMoreTokens()) { word.set(itr.nextToken()); output.collect(word, location); } }