List of usage examples for org.apache.hadoop.mapred.JobConf.get
public String get(String name)

Parameter:
    name - the property name

Returns: the value of the name property, or null if no such property exists.
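Before the project examples, here is a minimal sketch of the call itself; the property key my.app.input and the values are made up for illustration:

    JobConf conf = new JobConf();
    conf.set("my.app.input", "/data/in");

    String value = conf.get("my.app.input");              // returns "/data/in"
    String missing = conf.get("no.such.key");             // returns null
    String fallback = conf.get("no.such.key", "default"); // overload that returns a default instead of null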
From source file:edu.uci.ics.fuzzyjoin.hadoop.datagen.MapNewRecord.java
License:Apache License
@Override
public void configure(JobConf job) {
    //
    // create RecordGenerator
    //
    int offset = job.getInt(FuzzyJoinDriver.DATA_CRTCOPY_PROPERTY, -1);
    if (offset == -1) {
        System.err.println("ERROR: fuzzyjoin.data.crtcopy not set.");
        System.exit(-1);
    }
    recordGenerator = new RecordGenerator(offset);
    int noRecords = job.getInt(FuzzyJoinDriver.DATA_NORECORDS_PROPERTY, -1);
    if (noRecords == -1) {
        System.err.println("ERROR: fuzzyjoin.data.norecords not set.");
        System.exit(-1);
    }
    offsetRID = offset * noRecords;
    int dictionaryFactor = job.getInt(FuzzyJoinDriver.DATA_DICTIONARY_FACTOR_PROPERTY, 1);
    //
    // set RecordGenerator
    //
    Path tokenRankFile;
    try {
        Path[] cache = DistributedCache.getLocalCacheFiles(job);
        if (cache == null) {
            tokenRankFile = new Path(job.get(FuzzyJoinConfig.DATA_TOKENS_PROPERTY));
        } else {
            tokenRankFile = cache[0];
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    TokenLoad tokenLoad = new TokenLoad(tokenRankFile.toString(), recordGenerator);
    tokenLoad.loadTokenRank(dictionaryFactor);
    //
    // set Tokenizer
    //
    tokenizer = TokenizerFactory.getTokenizer(
            job.get(FuzzyJoinConfig.TOKENIZER_PROPERTY, FuzzyJoinConfig.TOKENIZER_VALUE),
            FuzzyJoinConfig.WORD_SEPARATOR_REGEX, FuzzyJoinConfig.TOKEN_SEPARATOR);
    //
    // set dataColumn
    //
    dataColumns = FuzzyJoinUtil.getDataColumns(
            job.get(FuzzyJoinConfig.RECORD_DATA_PROPERTY, FuzzyJoinConfig.RECORD_DATA_VALUE));
    // Arrays.sort(dataColumns);
}
From source file:edu.uci.ics.fuzzyjoin.hadoop.recordpairs.MapBroadcastJoin.java
License:Apache License
@Override
public void configure(JobConf job) {
    LOG.info("Configure START");
    //
    // read join index
    //
    Path[] cache = null;
    try {
        cache = DistributedCache.getLocalCacheFiles(job);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    if (cache == null) {
        addJoinIndex(new Path(job.get(FuzzyJoinDriver.DATA_JOININDEX_PROPERTY)));
    } else {
        for (Path path : cache) {
            addJoinIndex(path);
        }
    }
    //
    // get suffix for second relation
    //
    suffixSecond = job.get(FuzzyJoinDriver.DATA_SUFFIX_INPUT_PROPERTY, "")
            .split(FuzzyJoinDriver.SEPSARATOR_REGEX)[1];
    LOG.info("Configure END");
}
From source file:edu.uci.ics.fuzzyjoin.hadoop.recordpairs.MapBroadcastSelfJoin.java
License:Apache License
@Override
public void configure(JobConf job) {
    LOG.info("Configure START");
    //
    // read join index
    //
    Path[] cache = null;
    try {
        cache = DistributedCache.getLocalCacheFiles(job);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    if (cache == null) {
        addJoinIndex(new Path(job.get(FuzzyJoinDriver.DATA_JOININDEX_PROPERTY)));
    } else {
        for (Path path : cache) {
            addJoinIndex(path);
        }
    }
    LOG.info("Configure END");
}
From source file:edu.uci.ics.hyracks.dataflow.hadoop.AbstractHadoopOperatorDescriptor.java
License:Apache License
public void populateCache(JobConf jobConf) {
    try {
        String cache = jobConf.get(MAPRED_CACHE_FILES);
        System.out.println("cache:" + cache);
        if (cache == null) {
            return;
        }
        String localCache = jobConf.get(MAPRED_CACHE_LOCALFILES);
        System.out.println("localCache:" + localCache);
        if (localCache != null) {
            return;
        }
        localCache = "";
        StringTokenizer cacheTokenizer = new StringTokenizer(cache, ",");
        while (cacheTokenizer.hasMoreTokens()) {
            if (!"".equals(localCache)) {
                localCache += ",";
            }
            try {
                localCache += DCacheClient.get().get(cacheTokenizer.nextToken());
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        }
        jobConf.set(MAPRED_CACHE_LOCALFILES, localCache);
        System.out.println("localCache:" + localCache);
    } catch (Exception e) {
        // note: any exception reaching this point is silently swallowed
    }
}
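The MAPRED_CACHE_FILES and MAPRED_CACHE_LOCALFILES constants are not shown in this snippet. Assuming they mirror the legacy DistributedCache configuration keys of the old mapred API, their declarations would look like this (an assumption, since the declarations are not included in the excerpt):

    // Assumed declarations; these are the legacy DistributedCache keys,
    // not something confirmed by the snippet itself.
    private static final String MAPRED_CACHE_FILES = "mapred.cache.files";
    private static final String MAPRED_CACHE_LOCALFILES = "mapred.cache.localFiles";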
From source file:edu.ucsb.cs.hybrid.mappers.PSS3_SingleS_Mapper.java
License:Apache License
@Override
public void configure(JobConf job) {
    super.configure(job);
    PostingDocWeight[] b = new PostingDocWeight[1];
    for (i = 0; i < r; i++)
        Spointers.addCol(b);
    ArrayList<PostingDocWeight> a = new ArrayList<PostingDocWeight>();
    for (i = 0; i < r; i++)
        Bpointers.addRow(a);
    // note: the two loops above run before this.r is reassigned below, so
    // they rely on r already having a value (presumably set in super.configure)
    this.range = Integer.parseInt(job.get("RANGE"));
    this.r = Integer.parseInt(job.get("R"));
    Bdone = new BitSet(blockSize);
    blockCurrent = new IndexTermWeight[blockSize];
    for (i = 0; i < blockSize; i++)
        blockCurrent[i] = new IndexTermWeight();
    BdocumentLen = new int[blockSize];
}
From source file:edu.ucsb.cs.hybrid.mappers.SingleS_Runner.java
License:Apache License
public static Reader getReader(JobConf conf) throws IOException {
    boolean oneMap = conf.getBoolean(Config.SINGLE_MAP_PROPERTY, Config.SINGLE_MAP_VALUE);
    boolean splittable = conf.getBoolean(Config.SPLITABLE_PROPERTY, Config.SPLITABLE_VALUE);
    if (!oneMap || splittable)
        return new Reader(conf, new Path(conf.get("map.input.file")),
                conf.getInt(Config.COMP_BLOCK_PROPERTY, Config.COMP_BLOCK_VALUE));
    else
        return new OneMapReader(conf, new Path(conf.get("map.input.file")),
                conf.getInt(Config.COMP_BLOCK_PROPERTY, Config.COMP_BLOCK_VALUE));
}
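Note that map.input.file is not a user-defined key: in the old mapred API the framework sets it per map task to the path of the file backing the current split, which is why it can be read back here with a plain conf.get.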
From source file:edu.ucsb.cs.preprocessing.sequence.SeqMapper.java
License:Apache License
@Override
public void configure(JobConf job) {
    Path idPath = new Path(HashPagesDriver.IDS_FILE1);
    readIdMappings(job, idPath);
    openIdsMapping(HashPagesDriver.IDS_FILE2, job.get("mapred.task.partition"));
}
From source file:edu.umd.cloud9.collection.trec.BuildTrecForwardIndex.java
License:Apache License
/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        printUsage();
        return -1;
    }

    JobConf conf = new JobConf(getConf(), BuildTrecForwardIndex.class);
    FileSystem fs = FileSystem.get(getConf());

    String collectionPath = args[0];
    String outputPath = args[1];
    String indexFile = args[2];
    String mappingFile = args[3];

    LOG.info("Tool name: " + BuildTrecForwardIndex.class.getCanonicalName());
    LOG.info(" - collection path: " + collectionPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - index file: " + indexFile);
    LOG.info(" - mapping file: " + mappingFile);

    conf.setJobName(BuildTrecForwardIndex.class.getSimpleName());
    conf.set("mapred.child.java.opts", "-Xmx1024m");
    conf.setNumReduceTasks(1);

    if (conf.get("mapred.job.tracker").equals("local")) {
        conf.set(DOCNO_MAPPING_FILE_PROPERTY, mappingFile);
    } else {
        DistributedCache.addCacheFile(new URI(mappingFile), conf);
    }

    FileInputFormat.setInputPaths(conf, new Path(collectionPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    FileOutputFormat.setCompressOutput(conf, false);

    conf.setInputFormat(TrecDocumentInputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(IdentityReducer.class);

    // delete the output directory if it exists already
    FileSystem.get(conf).delete(new Path(outputPath), true);

    RunningJob job = JobClient.runJob(conf);

    Counters counters = job.getCounters();
    int numDocs = (int) counters.findCounter(Count.DOCS).getCounter();

    String inputFile = outputPath + "/" + "part-00000";

    LOG.info("Writing " + numDocs + " doc offsets to " + indexFile);

    FSLineReader reader = new FSLineReader(inputFile, fs);
    FSDataOutputStream writer = fs.create(new Path(indexFile), true);

    writer.writeUTF(edu.umd.cloud9.collection.trec.TrecForwardIndex.class.getCanonicalName());
    writer.writeUTF(collectionPath);
    writer.writeInt(numDocs);

    int cnt = 0;
    Text line = new Text();
    while (reader.readLine(line) > 0) {
        String[] arr = line.toString().split("\\t");
        long offset = Long.parseLong(arr[1]);
        int len = Integer.parseInt(arr[2]);

        writer.writeLong(offset);
        writer.writeInt(len);

        cnt++;
        if (cnt % 100000 == 0) {
            LOG.info(cnt + " docs");
        }
    }
    reader.close();
    writer.close();
    LOG.info(cnt + " docs total. Done!");

    if (numDocs != cnt) {
        throw new RuntimeException("Unexpected number of documents in building forward index!");
    }

    return 0;
}
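The driver above sets DOCNO_MAPPING_FILE_PROPERTY directly only when the job tracker is local, and otherwise ships the file through DistributedCache. A mapper consuming it would read the property back with JobConf.get in configure(). The sketch below is only illustrative, not the actual Cloud9 MyMapper; the null check is an assumption about how the local/distributed-cache split would be handled on the task side:

    // Hypothetical mapper-side counterpart (not the actual Cloud9 MyMapper).
    public void configure(JobConf job) {
        try {
            String mappingFile = job.get(DOCNO_MAPPING_FILE_PROPERTY);
            Path mappingPath;
            if (mappingFile != null) {
                // local mode: the driver put the path directly into the JobConf
                mappingPath = new Path(mappingFile);
            } else {
                // cluster mode: the driver shipped the file via DistributedCache
                mappingPath = DistributedCache.getLocalCacheFiles(job)[0];
            }
            // ... load the docno mapping from mappingPath ...
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }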
From source file:edu.umd.cloud9.webgraph.BuildReverseWebGraph.java
License:Apache License
public int runTool() throws Exception {
    JobConf conf = new JobConf(getConf(), BuildReverseWebGraph.class);
    FileSystem fs = FileSystem.get(conf);

    int numMappers = conf.getInt("Cloud9.Mappers", 1);
    int numReducers = conf.getInt("Cloud9.Reducers", 200);

    String inputPath = conf.get("Cloud9.InputPath");
    String outputPath = conf.get("Cloud9.OutputPath");

    conf.setJobName("ReverseWebGraph");
    conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.setInt("mapred.task.timeout", 60000000);
    conf.set("mapreduce.map.memory.mb", "2048");
    conf.set("mapreduce.map.java.opts", "-Xmx2048m");
    conf.set("mapreduce.reduce.memory.mb", "2048");
    conf.set("mapreduce.reduce.java.opts", "-Xmx2048m");
    conf.set("mapreduce.task.timeout", "60000000");

    conf.setNumMapTasks(numMappers);
    conf.setNumReduceTasks(numReducers);

    conf.setMapperClass(IdentityMapper.class);
    conf.setReducerClass(Reduce.class);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(ArrayListWritable.class);
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(ArrayListWritable.class);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setCompressOutput(conf, true);
    SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.BLOCK);

    SequenceFileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    LOG.info("BuildReverseWebGraph");
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);

    if (!fs.exists(new Path(outputPath))) {
        JobClient.runJob(conf);
    } else {
        LOG.info(outputPath + " already exists! Skipping this step...");
    }

    return 0;
}
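Both this tool and BuildWebGraph below read their paths and task counts from Cloud9.* properties rather than from command-line arguments, so a caller has to set those keys before invoking runTool(). A hedged sketch of such a caller, assuming the tool follows the standard Configured/Tool pattern (the paths are made up):

    // Hypothetical caller; the property keys match the conf.get()/getInt()
    // calls in runTool() above, while the paths themselves are illustrative.
    Configuration conf = new Configuration();
    conf.set("Cloud9.InputPath", "/collection/webgraph");
    conf.set("Cloud9.OutputPath", "/collection/reverse-webgraph");
    conf.setInt("Cloud9.Mappers", 1);
    conf.setInt("Cloud9.Reducers", 200);

    BuildReverseWebGraph tool = new BuildReverseWebGraph();
    tool.setConf(conf);
    tool.runTool();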
From source file:edu.umd.cloud9.webgraph.BuildWebGraph.java
License:Apache License
public int runTool() throws Exception {
    JobConf conf = new JobConf(getConf(), BuildWebGraph.class);
    FileSystem fs = FileSystem.get(conf);

    int numMappers = conf.getInt("Cloud9.Mappers", 1);
    int numReducers = conf.getInt("Cloud9.Reducers", 200);

    String inputPath = conf.get("Cloud9.InputPath");
    String outputPath = conf.get("Cloud9.OutputPath");

    conf.setJobName("ConstructWebGraph");
    conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.setInt("mapred.task.timeout", 60000000);
    conf.set("mapreduce.map.memory.mb", "2048");
    conf.set("mapreduce.map.java.opts", "-Xmx2048m");
    conf.set("mapreduce.reduce.memory.mb", "2048");
    conf.set("mapreduce.reduce.java.opts", "-Xmx2048m");
    conf.set("mapreduce.task.timeout", "60000000");

    conf.setNumMapTasks(numMappers);
    conf.setNumReduceTasks(numReducers);

    conf.setMapperClass(Map.class);
    conf.setReducerClass(Reduce.class);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(ArrayListWritable.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(ArrayListWritable.class);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setCompressOutput(conf, true);
    SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.BLOCK);

    SequenceFileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    LOG.info("BuildWebGraph");
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);

    if (!fs.exists(new Path(outputPath))) {
        JobClient.runJob(conf);
    } else {
        LOG.info(outputPath + " already exists! Skipping this step...");
    }

    return 0;
}