List of usage examples for org.apache.hadoop.mapred JobConf getInt
public int getInt(String name, int defaultValue)

Gets the value of the name property as an int; if the property is not set, the supplied defaultValue is returned.

Parameter: name - the name of the property.
Return: the value of the named property as an int, or defaultValue if the property is unset.
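The examples below are collected from open-source projects. As a self-contained warm-up (the property key here is invented for illustration, not taken from any project below), getInt simply reads a named configuration value and falls back to the supplied default when the property is unset:

import org.apache.hadoop.mapred.JobConf;

public class GetIntDemo {
    public static void main(String[] args) {
        JobConf conf = new JobConf();
        // Property not set yet: the default (4) is returned.
        int before = conf.getInt("example.num.partitions", 4);
        // Set the property, then read it back as an int.
        conf.setInt("example.num.partitions", 16);
        int after = conf.getInt("example.num.partitions", 4);
        System.out.println(before + " -> " + after); // prints "4 -> 16"
    }
}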
. From source file:edu.ub.ahstfg.indexer.mapred.IndexRecordReader.java
License:Open Source License
/**
 * Sole constructor.
 * @param job Job reading the records.
 * @param input Split where the records are.
 * @param reporter Job reporter.
 * @throws IOException
 */
public IndexRecordReader(JobConf job, FileSplit input, Reporter reporter) throws IOException {
    this.reporter = reporter;
    lineReader = new LineRecordReader(job, input);
    lineKey = lineReader.createKey();
    lineValue = lineReader.createValue();
    numMachines = job.getInt(ParamSet.NUM_MACHINES, 10);
    numDocs = job.getInt(ParamSet.NUM_DOCS, 1000);
    qDocsPerMapper = numDocs / numMachines;
    rDocsPerMapper = numDocs - (qDocsPerMapper * numMachines);
    fillDocsPerMapper();
}
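In this constructor getInt supplies working defaults (10 machines, 1000 documents) that the job driver can override. A hedged driver-side sketch of how those overrides would be supplied, assuming ParamSet.NUM_MACHINES and ParamSet.NUM_DOCS are the same String property keys read above:

// Hypothetical driver-side setup; not taken from the project source.
public static JobConf configureIndexJob(int machines, int docs) {
    JobConf job = new JobConf();
    job.setInt(ParamSet.NUM_MACHINES, machines); // read back by IndexRecordReader via getInt
    job.setInt(ParamSet.NUM_DOCS, docs);
    return job;
}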
From source file:edu.ubc.mirrors.holographs.mapreduce.SnapshotObjectsOfTypeInputFormat.java
License:Open Source License
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    ISnapshot snapshot = SnapshotUtils.openSnapshot(job);
    int[] objectIDs = SnapshotUtils.getInputObjectIDs(job, snapshot);
    SnapshotFactory.dispose(snapshot);

    int maxNumObjects = job.getInt("maxNumObjects", objectIDs.length);
    int numObjects = Math.min(objectIDs.length, maxNumObjects);

    int approxSplitSize = job.getInt("splitSize", -1);
    if (approxSplitSize == -1) {
        approxSplitSize = Math.max(1, (int) (((float) numObjects) / numSplits));
    } else {
        numSplits = Math.max(1, (int) (((float) numObjects) / approxSplitSize));
    }

    InputSplit[] splits = new InputSplit[numSplits];
    int offset = 0;
    for (int index = 0; index < numSplits; index++) {
        int splitSize = Math.min(numObjects - offset, approxSplitSize);
        if (index == numSplits - 1) {
            splitSize = numObjects - offset;
        }
        int[] slice = new int[splitSize];
        System.arraycopy(objectIDs, offset, slice, 0, splitSize);
        splits[index] = new ObjectIDArraySplit(slice);
        offset += splitSize;
    }
    return splits;
}
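The split-size logic above shows a second getInt idiom: a sentinel default of -1 lets the code tell "property never set" apart from a real value. A minimal sketch of that idiom on its own, with a hypothetical helper and the same property name:

// -1 signals that "splitSize" was not configured, so a value is derived instead.
static int splitSizeOrDerived(JobConf job, int numObjects, int numSplits) {
    int size = job.getInt("splitSize", -1);
    if (size == -1) {
        size = Math.max(1, numObjects / Math.max(1, numSplits));
    }
    return size;
}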
From source file:edu.uci.ics.fuzzyjoin.hadoop.datagen.MapNewRecord.java
License:Apache License
@Override
public void configure(JobConf job) {
    //
    // create RecordGenerator
    //
    int offset = job.getInt(FuzzyJoinDriver.DATA_CRTCOPY_PROPERTY, -1);
    if (offset == -1) {
        System.err.println("ERROR: fuzzyjoin.data.crtcopy not set.");
        System.exit(-1);
    }
    recordGenerator = new RecordGenerator(offset);
    int noRecords = job.getInt(FuzzyJoinDriver.DATA_NORECORDS_PROPERTY, -1);
    if (noRecords == -1) {
        System.err.println("ERROR: fuzzyjoin.data.norecords not set.");
        System.exit(-1);
    }
    offsetRID = offset * noRecords;
    int dictionaryFactor = job.getInt(FuzzyJoinDriver.DATA_DICTIONARY_FACTOR_PROPERTY, 1);
    //
    // set RecordGenerator
    //
    Path tokenRankFile;
    try {
        Path[] cache = DistributedCache.getLocalCacheFiles(job);
        if (cache == null) {
            tokenRankFile = new Path(job.get(FuzzyJoinConfig.DATA_TOKENS_PROPERTY));
        } else {
            tokenRankFile = cache[0];
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    TokenLoad tokenLoad = new TokenLoad(tokenRankFile.toString(), recordGenerator);
    tokenLoad.loadTokenRank(dictionaryFactor);
    //
    // set Tokenizer
    //
    tokenizer = TokenizerFactory.getTokenizer(
            job.get(FuzzyJoinConfig.TOKENIZER_PROPERTY, FuzzyJoinConfig.TOKENIZER_VALUE),
            FuzzyJoinConfig.WORD_SEPARATOR_REGEX, FuzzyJoinConfig.TOKEN_SEPARATOR);
    //
    // set dataColumn
    //
    dataColumns = FuzzyJoinUtil
            .getDataColumns(job.get(FuzzyJoinConfig.RECORD_DATA_PROPERTY, FuzzyJoinConfig.RECORD_DATA_VALUE));
    // Arrays.sort(dataColumns);
}
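Here the -1 sentinel makes two properties effectively mandatory: if either DATA_CRTCOPY_PROPERTY or DATA_NORECORDS_PROPERTY is missing, the task aborts. A generic sketch of that required-int check (hypothetical helper, throwing instead of calling System.exit):

static int requireInt(JobConf job, String key) {
    int value = job.getInt(key, -1); // -1 marks "not set"
    if (value == -1) {
        throw new IllegalStateException("Required int property not set: " + key);
    }
    return value;
}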
From source file:edu.uci.ics.fuzzyjoin.hadoop.datagen.MapRecordOnly.java
License:Apache License
@Override
public void configure(JobConf job) {
    noRecords = job.getInt(FuzzyJoinDriver.DATA_NORECORDS_PROPERTY, -1);
}
From source file:edu.uci.ics.fuzzyjoin.hadoop.ridpairs.ppjoin.MapJoin.java
License:Apache License
@Override
public void configure(JobConf job) {
    //
    // set Tokenizer and SimilarityFilters
    //
    tokenizer = TokenizerFactory.getTokenizer(
            job.get(FuzzyJoinConfig.TOKENIZER_PROPERTY, FuzzyJoinConfig.TOKENIZER_VALUE),
            FuzzyJoinConfig.WORD_SEPARATOR_REGEX, FuzzyJoinConfig.TOKEN_SEPARATOR);
    String similarityName = job.get(FuzzyJoinConfig.SIMILARITY_NAME_PROPERTY,
            FuzzyJoinConfig.SIMILARITY_NAME_VALUE);
    float similarityThreshold = job.getFloat(FuzzyJoinConfig.SIMILARITY_THRESHOLD_PROPERTY,
            FuzzyJoinConfig.SIMILARITY_THRESHOLD_VALUE);
    similarityFilters = SimilarityFiltersFactory.getSimilarityFilters(similarityName, similarityThreshold);
    //
    // set TokenRank and TokenGroup
    //
    Path tokensPath;
    Path lengthstatsPath = null;
    try {
        Path[] cache = DistributedCache.getLocalCacheFiles(job);
        if (cache == null) {
            tokensPath = new Path(job.get(FuzzyJoinConfig.DATA_TOKENS_PROPERTY));
            try {
                lengthstatsPath = new Path(job.get(FuzzyJoinDriver.DATA_LENGTHSTATS_PROPERTY));
            } catch (IllegalArgumentException e) {
            }
        } else {
            tokensPath = cache[0];
            if (job.getBoolean(FuzzyJoinDriver.TOKENS_LENGTHSTATS_PROPERTY,
                    FuzzyJoinDriver.TOKENS_LENGTHSTATS_VALUE)) {
                lengthstatsPath = cache[1];
            }
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    String recordGroupClass = job.get(FuzzyJoinDriver.RIDPAIRS_GROUP_CLASS_PROPERTY,
            FuzzyJoinDriver.RIDPAIRS_GROUP_CLASS_VALUE);
    int recordGroupFactor = job.getInt(FuzzyJoinDriver.RIDPAIRS_GROUP_FACTOR_PROPERTY,
            FuzzyJoinDriver.RIDPAIRS_GROUP_FACTOR_VALUE);
    recordGroup = RecordGroupFactory.getRecordGroup(recordGroupClass,
            Math.max(1, job.getNumReduceTasks() * recordGroupFactor), similarityFilters, "" + lengthstatsPath);
    TokenLoad tokenLoad = new TokenLoad(tokensPath.toString(), tokenRank);
    tokenLoad.loadTokenRank();
    //
    // set dataColumn
    //
    dataColumns[0] = FuzzyJoinUtil
            .getDataColumns(job.get(FuzzyJoinConfig.RECORD_DATA_PROPERTY, FuzzyJoinConfig.RECORD_DATA_VALUE));
    dataColumns[1] = FuzzyJoinUtil
            .getDataColumns(job.get(FuzzyJoinConfig.RECORD_DATA1_PROPERTY, FuzzyJoinConfig.RECORD_DATA1_VALUE));
    //
    // get suffix for second relation
    //
    suffixSecond = job.get(FuzzyJoinDriver.DATA_SUFFIX_INPUT_PROPERTY, "")
            .split(FuzzyJoinDriver.SEPSARATOR_REGEX)[1];
}
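Of all the settings read in this configure method, only the record-group factor goes through getInt; it is multiplied by the reducer count to size the record groups. A small sketch of that sizing step in isolation (the helper, key, and default are invented; the real code uses FuzzyJoinDriver constants):

static int numRecordGroups(JobConf job) {
    // Mirrors Math.max(1, getNumReduceTasks() * factor) from the example above.
    int factor = job.getInt("example.ridpairs.group.factor", 1);
    return Math.max(1, job.getNumReduceTasks() * factor);
}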
From source file:edu.uci.ics.fuzzyjoin.hadoop.ridpairs.ppjoin.MapSelfJoin.java
License:Apache License
@Override
public void configure(JobConf job) {
    //
    // set Tokenizer and SimilarityFilters
    //
    tokenizer = TokenizerFactory.getTokenizer(
            job.get(FuzzyJoinConfig.TOKENIZER_PROPERTY, FuzzyJoinConfig.TOKENIZER_VALUE),
            FuzzyJoinConfig.WORD_SEPARATOR_REGEX, FuzzyJoinConfig.TOKEN_SEPARATOR);
    String similarityName = job.get(FuzzyJoinConfig.SIMILARITY_NAME_PROPERTY,
            FuzzyJoinConfig.SIMILARITY_NAME_VALUE);
    float similarityThreshold = job.getFloat(FuzzyJoinConfig.SIMILARITY_THRESHOLD_PROPERTY,
            FuzzyJoinConfig.SIMILARITY_THRESHOLD_VALUE);
    similarityFilters = SimilarityFiltersFactory.getSimilarityFilters(similarityName, similarityThreshold);
    //
    // set TokenRank and TokenGroup
    //
    Path tokensPath;
    Path lengthstatsPath = null;
    try {
        Path[] cache = DistributedCache.getLocalCacheFiles(job);
        if (cache == null) {
            tokensPath = new Path(job.get(FuzzyJoinConfig.DATA_TOKENS_PROPERTY));
            try {
                lengthstatsPath = new Path(job.get(FuzzyJoinDriver.DATA_LENGTHSTATS_PROPERTY));
            } catch (IllegalArgumentException e) {
            }
        } else {
            tokensPath = cache[0];
            if (job.getBoolean(FuzzyJoinDriver.TOKENS_LENGTHSTATS_PROPERTY,
                    FuzzyJoinDriver.TOKENS_LENGTHSTATS_VALUE)) {
                lengthstatsPath = cache[1];
            }
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    String recordGroupClass = job.get(FuzzyJoinDriver.RIDPAIRS_GROUP_CLASS_PROPERTY,
            FuzzyJoinDriver.RIDPAIRS_GROUP_CLASS_VALUE);
    int recordGroupFactor = job.getInt(FuzzyJoinDriver.RIDPAIRS_GROUP_FACTOR_PROPERTY,
            FuzzyJoinDriver.RIDPAIRS_GROUP_FACTOR_VALUE);
    recordGroup = RecordGroupFactory.getRecordGroup(recordGroupClass,
            Math.max(1, job.getNumReduceTasks() * recordGroupFactor), similarityFilters, "" + lengthstatsPath);
    TokenLoad tokenLoad = new TokenLoad(tokensPath.toString(), tokenRank);
    tokenLoad.loadTokenRank();
    //
    // set dataColumn
    //
    dataColumns = FuzzyJoinUtil
            .getDataColumns(job.get(FuzzyJoinConfig.RECORD_DATA_PROPERTY, FuzzyJoinConfig.RECORD_DATA_VALUE));
}
From source file:edu.uci.ics.fuzzyjoin.hadoop.ridrecordpairs.ppjoin.MapJoin.java
License:Apache License
@Override
public void configure(JobConf job) {
    //
    // set Tokenizer and SimilarityFilters
    //
    tokenizer = TokenizerFactory.getTokenizer(
            job.get(FuzzyJoinConfig.TOKENIZER_PROPERTY, FuzzyJoinConfig.TOKENIZER_VALUE),
            FuzzyJoinConfig.WORD_SEPARATOR_REGEX, FuzzyJoinConfig.TOKEN_SEPARATOR);
    String similarityName = job.get(FuzzyJoinConfig.SIMILARITY_NAME_PROPERTY,
            FuzzyJoinConfig.SIMILARITY_NAME_VALUE);
    float similarityThreshold = job.getFloat(FuzzyJoinConfig.SIMILARITY_THRESHOLD_PROPERTY,
            FuzzyJoinConfig.SIMILARITY_THRESHOLD_VALUE);
    similarityFilters = SimilarityFiltersFactory.getSimilarityFilters(similarityName, similarityThreshold);
    //
    // set TokenRank and TokenGroup
    //
    Path tokensPath;
    Path lengthstatsPath = null;
    try {
        Path[] cache = DistributedCache.getLocalCacheFiles(job);
        if (cache == null) {
            tokensPath = new Path(job.get(FuzzyJoinConfig.DATA_TOKENS_PROPERTY));
            try {
                lengthstatsPath = new Path(job.get(FuzzyJoinDriver.DATA_LENGTHSTATS_PROPERTY));
            } catch (IllegalArgumentException e) {
            }
        } else {
            tokensPath = cache[0];
            if (job.getBoolean(FuzzyJoinDriver.TOKENS_LENGTHSTATS_PROPERTY,
                    FuzzyJoinDriver.TOKENS_LENGTHSTATS_VALUE)) {
                lengthstatsPath = cache[1];
            }
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    String recordGroupClass = job.get(FuzzyJoinDriver.RIDPAIRS_GROUP_CLASS_PROPERTY,
            FuzzyJoinDriver.RIDPAIRS_GROUP_CLASS_VALUE);
    int recordGroupFactor = job.getInt(FuzzyJoinDriver.RIDPAIRS_GROUP_FACTOR_PROPERTY,
            FuzzyJoinDriver.RIDPAIRS_GROUP_FACTOR_VALUE);
    recordGroup = RecordGroupFactory.getRecordGroup(recordGroupClass,
            Math.max(1, job.getNumReduceTasks() * recordGroupFactor), similarityFilters, "" + lengthstatsPath);
    TokenLoad tokenLoad = new TokenLoad(tokensPath.toString(), tokenRank);
    tokenLoad.loadTokenRank();
    //
    // set dataColumn
    //
    dataColumns[0] = FuzzyJoinUtil
            .getDataColumns(job.get(FuzzyJoinConfig.RECORD_DATA_PROPERTY, FuzzyJoinConfig.RECORD_DATA_VALUE));
    dataColumns[1] = FuzzyJoinUtil
            .getDataColumns(job.get(FuzzyJoinConfig.RECORD_DATA1_PROPERTY, FuzzyJoinConfig.RECORD_DATA1_VALUE));
    //
    // get suffix for second relation
    //
    suffixSecond = job.get(FuzzyJoinDriver.DATA_SUFFIX_INPUT_PROPERTY, "")
            .split(FuzzyJoinDriver.SEPSARATOR_REGEX)[1];
}
From source file:edu.uci.ics.hyracks.hadoop.compat.util.HadoopAdapter.java
License:Apache License
public static ExternalSortOperatorDescriptor getExternalSorter(JobConf conf, IOperatorDescriptorRegistry spec) {
    ExternalSortOperatorDescriptor externalSortOp = null;
    RecordDescriptor recordDescriptor = getHadoopRecordDescriptor(conf.getMapOutputKeyClass().getName(),
            conf.getMapOutputValueClass().getName());
    Class<? extends RawComparator> rawComparatorClass = null;
    WritableComparator writableComparator = WritableComparator
            .get(conf.getMapOutputKeyClass().asSubclass(WritableComparable.class));
    WritableComparingBinaryComparatorFactory comparatorFactory = new WritableComparingBinaryComparatorFactory(
            writableComparator.getClass());
    externalSortOp = new ExternalSortOperatorDescriptor(spec,
            conf.getInt(HYRACKS_EX_SORT_FRAME_LIMIT, DEFAULT_EX_SORT_FRAME_LIMIT), new int[] { 0 },
            new IBinaryComparatorFactory[] { comparatorFactory }, recordDescriptor);
    return externalSortOp;
}
From source file:edu.ucsb.cs.hybrid.HybridDriver.java
License:Apache License
public static void main(String args[]) throws ParseException, IOException {
    // job.set("mapred.job.tracker", "local");
    // job.set("fs.default.name", "file:///");
    JobConf job = new JobConf();
    job.setJarByClass(HybridDriver.class);
    new GenericOptionsParser(job, args);
    setMapperAndRunner(job);
    job.setMapOutputKeyClass(DocDocWritable.class);
    job.setMapOutputValueClass(FloatWritable.class);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(DocDocWritable.class);
    job.setOutputValueClass(FloatWritable.class);

    Path inputPath = new Path(INPUT_DIR);
    CustomSequenceFileInputFormat.addInputPath(job, inputPath);
    Path outputPath = new Path(OUTPUT_DIR);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(job, outputPath);
    FileSystem.get(job).delete(outputPath, true);
    job.setBoolean("fs.hdfs.impl.disable.cache", true); // xun not sure if needed

    if (job.getBoolean(Config.SPLITABLE_PROPERTY, Config.SPLITABLE_VALUE)) {
        job.setInputFormat(CustomSequenceFileInputFormat.class);
        Long splitMB = job.getLong(Config.SPLIT_MB_PROPERTY, Config.SPLIT_MB_VALUE) * 1024 * 1024;
        job.setLong("mapred.min.split.size", splitMB);
        job.setLong("mapred.max.split.size", splitMB);
        job.setLong("dfs.block.size", splitMB);
    } else {
        // Comment the following of splitter for www experiments; it assumes no splitting
        // of partitions for load balancing, should be fixed.
        Splitter.configure(job, inputPath); // remove comment unless for www
        job.setInputFormat(NonSplitableSequenceInputFormat.class); // remove comment
    }

    // SIGIR'14 two-stage balancing, not yet fully incorporated
    if (job.getInt(Config.LOAD_BALANCE_PROPERTY, Config.LOAD_BALANCE_VALUE) != 0) {
        TwoStageLoadbalancing.main(job.getInt(Config.LOAD_BALANCE_PROPERTY, Config.LOAD_BALANCE_VALUE),
                new Path(PartDriver.OUTPUT_DIR), job);
    }

    JobSubmitter.run(job, "SIMILARITY", job.getFloat(Config.THRESHOLD_PROPERTY, Config.THRESHOLD_VALUE));

    if (job.getBoolean(Config.CONVERT_TEXT_PROPERTY, Config.CONVERT_TEXT_VALUE))
        IDMappingJob(args);
}
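The load-balancing check above illustrates yet another getInt idiom: an int property acting as an on/off switch, where the default (presumably zero) leaves the optional step disabled. A hypothetical sketch of the same pattern, with an invented key:

static boolean loadBalancingEnabled(JobConf job) {
    // 0 (the assumed default) disables the optional step; any non-zero value enables it.
    return job.getInt("example.loadbalance.stages", 0) != 0;
}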
From source file:edu.ucsb.cs.hybrid.HybridDriver.java
License:Apache License
/**
 * @param job : passed by reference to set its mapper class.
 */
public static void setMapperAndRunner(JobConf job) {
    int numSplits = job.getInt(Config.NUMBER_SPLITS_PROPERTY, Config.NUMBER_SPLITS_VALUE);
    int PSSChoice = job.getInt(Config.BLOCK_CHOICE_PROPERTY, Config.BLOCK_CHOICE_VALUE); // 1, 2
    String name = "PSS";

    if (numSplits > 1) { // check: can I set # splits for runner here?
        job.setMapRunnerClass(MultipleS_Runner.class);
        if (job.getBoolean(Config.MULTI_THREADS_PROPERTY, Config.MULTI_THREADS_VALUE)) { // threads testing
            job.setMapperClass(PSS1_Threaded_Mapper.class); // naming
        } else if (PSSChoice == 1) {
            name += "1";
            job.setMapperClass(PSS1_Mapper.class);
        } else if (PSSChoice == 2) {
            name += "2";
            job.setMapperClass(PSS2_Mapper.class); // MultipleS_Block1_Mapper
        } else
            ; // for future implementations
    } else {
        job.setMapRunnerClass(SingleS_Runner.class);
        if (job.getBoolean(Config.MULTI_THREADS_PROPERTY, Config.MULTI_THREADS_VALUE)) // threads
            throw new RuntimeException(
                    "ERROR: Single S with multithreads! Set hybrid.threads.property to false.");
        if (PSSChoice == 1) {
            job.setMapperClass(PSS_Mapper.class);
            if (job.getBoolean(Config.BAYADRO_SKIP_PROPERTY, Config.BAYADRO_SKIP_VALUE)) {
                name += "/Bayardo_Dynamic_filter";
                job.setMapperClass(PSS_Bayardo_Mapper.class); // PSS + Bayardo WWW'07
            }
        } else if (PSSChoice == 2) {
            name += "2/SingleS";
            job.setMapperClass(PSS2_SingleS_Mapper.class);
        } else
            job.setMapperClass(PSS3_SingleS_Mapper.class); // what is this?
    }
    job.setJobName(name);
}
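setMapperAndRunner dispatches on two integers read with getInt: the number of splits handled per mapper and the PSS variant. A hedged driver-side sketch of supplying them, assuming Config.NUMBER_SPLITS_PROPERTY and Config.BLOCK_CHOICE_PROPERTY are the String keys read above (the values are illustrative only):

JobConf job = new JobConf(HybridDriver.class);
job.setInt(Config.NUMBER_SPLITS_PROPERTY, 8); // > 1 selects the MultipleS_Runner path
job.setInt(Config.BLOCK_CHOICE_PROPERTY, 2);  // picks the PSS2 mapper variant
setMapperAndRunner(job);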