List of usage examples for org.apache.hadoop.mapred.JobConf#setInputFormat

Method signature:

public void setInputFormat(Class<? extends InputFormat> theClass)
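Before the examples pulled from real projects, here is a minimal, self-contained sketch of the typical call site: building an old-API (org.apache.hadoop.mapred) job and selecting its input format with setInputFormat. The class name SetInputFormatExample and the args-based paths are hypothetical placeholders, not taken from any of the sources below.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

public class SetInputFormatExample {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(SetInputFormatExample.class);
        conf.setJobName("set-input-format-example");

        // Tell the job which old-API InputFormat to use when reading its input.
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        // TextInputFormat produces LongWritable offsets and Text lines;
        // with the default identity mapper/reducer these are also the output types.
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Text.class);

        // Hypothetical input/output locations passed on the command line.
        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);
    }
}

The same pattern recurs in every example below: create a JobConf, call setInputFormat with an InputFormat class from the old mapred API, set the remaining job parameters, and submit the job.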
From source file: io.fluo.stress.trie.Unique.java
License: Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length < 1) {
        log.error("Usage: " + this.getClass().getSimpleName() + "<input dir>{ <input dir>}");
        System.exit(-1);
    }

    JobConf job = new JobConf(getConf());
    job.setJobName(Unique.class.getName());
    job.setJarByClass(Unique.class);

    job.setInputFormat(SequenceFileInputFormat.class);
    for (String arg : args) {
        SequenceFileInputFormat.addInputPath(job, new Path(arg));
    }

    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(NullWritable.class);

    job.setReducerClass(UniqueReducer.class);

    job.setOutputFormat(NullOutputFormat.class);

    RunningJob runningJob = JobClient.runJob(job);
    runningJob.waitForCompletion();

    numUnique = (int) runningJob.getCounters().getCounter(Stats.UNIQUE);
    log.debug("numUnique : " + numUnique);

    return runningJob.isSuccessful() ? 0 : -1;
}
From source file: io.hops.erasure_coding.MapReduceEncoder.java
License: Apache License
/**
 * create new job conf based on configuration passed.
 *
 * @param conf
 * @return
 */
private static JobConf createJobConf(Configuration conf) {
    JobConf jobconf = new JobConf(conf, MapReduceEncoder.class);
    jobName = NAME + " " + dateForm.format(new Date(BaseEncodingManager.now()));
    jobconf.setUser(BaseEncodingManager.JOBUSER);
    jobconf.setJobName(jobName);
    jobconf.setMapSpeculativeExecution(false);
    RaidUtils.parseAndSetOptions(jobconf, SCHEDULER_OPTION_LABEL);

    jobconf.setJarByClass(MapReduceEncoder.class);
    jobconf.setInputFormat(DistRaidInputFormat.class);
    jobconf.setOutputKeyClass(Text.class);
    jobconf.setOutputValueClass(Text.class);

    jobconf.setMapperClass(DistRaidMapper.class);
    jobconf.setNumReduceTasks(0);
    return jobconf;
}
From source file: io.prestosql.plugin.hive.BackgroundHiveSplitLoader.java
License: Apache License
private ListenableFuture<?> loadPartition(HivePartitionMetadata partition) throws IOException {
    String partitionName = partition.getHivePartition().getPartitionId();
    Properties schema = getPartitionSchema(table, partition.getPartition());
    List<HivePartitionKey> partitionKeys = getPartitionKeys(table, partition.getPartition());
    TupleDomain<HiveColumnHandle> effectivePredicate = (TupleDomain<HiveColumnHandle>) compactEffectivePredicate;

    Path path = new Path(getPartitionLocation(table, partition.getPartition()));
    Configuration configuration = hdfsEnvironment.getConfiguration(hdfsContext, path);
    InputFormat<?, ?> inputFormat = getInputFormat(configuration, schema, false);
    FileSystem fs = hdfsEnvironment.getFileSystem(hdfsContext, path);
    boolean s3SelectPushdownEnabled = shouldEnablePushdownForTable(session, table, path.toString(),
            partition.getPartition());

    if (inputFormat instanceof SymlinkTextInputFormat) {
        if (tableBucketInfo.isPresent()) {
            throw new PrestoException(NOT_SUPPORTED, "Bucketed table in SymlinkTextInputFormat is not yet supported");
        }

        // TODO: This should use an iterator like the HiveFileIterator
        ListenableFuture<?> lastResult = COMPLETED_FUTURE;
        for (Path targetPath : getTargetPathsFromSymlink(fs, path)) {
            // The input should be in TextInputFormat.
            TextInputFormat targetInputFormat = new TextInputFormat();
            // the splits must be generated using the file system for the target path
            // get the configuration for the target path -- it may be a different hdfs instance
            FileSystem targetFilesystem = hdfsEnvironment.getFileSystem(hdfsContext, targetPath);
            JobConf targetJob = toJobConf(targetFilesystem.getConf());
            targetJob.setInputFormat(TextInputFormat.class);
            targetInputFormat.configure(targetJob);
            FileInputFormat.setInputPaths(targetJob, targetPath);
            InputSplit[] targetSplits = targetInputFormat.getSplits(targetJob, 0);

            InternalHiveSplitFactory splitFactory = new InternalHiveSplitFactory(targetFilesystem, partitionName,
                    inputFormat, schema, partitionKeys, effectivePredicate, partition.getColumnCoercions(),
                    Optional.empty(), isForceLocalScheduling(session), s3SelectPushdownEnabled);
            lastResult = addSplitsToSource(targetSplits, splitFactory);
            if (stopped) {
                return COMPLETED_FUTURE;
            }
        }
        return lastResult;
    }

    Optional<BucketConversion> bucketConversion = Optional.empty();
    boolean bucketConversionRequiresWorkerParticipation = false;
    if (partition.getPartition().isPresent()) {
        Optional<HiveBucketProperty> partitionBucketProperty = partition.getPartition().get().getStorage()
                .getBucketProperty();
        if (tableBucketInfo.isPresent() && partitionBucketProperty.isPresent()) {
            int readBucketCount = tableBucketInfo.get().getReadBucketCount();
            int partitionBucketCount = partitionBucketProperty.get().getBucketCount();
            // Validation was done in HiveSplitManager#getPartitionMetadata.
            // Here, it's just trying to see if it needs the BucketConversion.
            if (readBucketCount != partitionBucketCount) {
                bucketConversion = Optional.of(new BucketConversion(readBucketCount, partitionBucketCount,
                        tableBucketInfo.get().getBucketColumns()));
                if (readBucketCount > partitionBucketCount) {
                    bucketConversionRequiresWorkerParticipation = true;
                }
            }
        }
    }

    InternalHiveSplitFactory splitFactory = new InternalHiveSplitFactory(fs, partitionName, inputFormat, schema,
            partitionKeys, effectivePredicate, partition.getColumnCoercions(),
            bucketConversionRequiresWorkerParticipation ? bucketConversion : Optional.empty(),
            isForceLocalScheduling(session), s3SelectPushdownEnabled);

    // To support custom input formats, we want to call getSplits()
    // on the input format to obtain file splits.
    if (shouldUseFileSplitsFromInputFormat(inputFormat)) {
        if (tableBucketInfo.isPresent()) {
            throw new PrestoException(NOT_SUPPORTED,
                    "Presto cannot read bucketed partition in an input format with UseFileSplitsFromInputFormat annotation: "
                            + inputFormat.getClass().getSimpleName());
        }
        JobConf jobConf = toJobConf(configuration);
        FileInputFormat.setInputPaths(jobConf, path);
        InputSplit[] splits = inputFormat.getSplits(jobConf, 0);

        return addSplitsToSource(splits, splitFactory);
    }

    // Bucketed partitions are fully loaded immediately since all files must be loaded to determine the file to bucket mapping
    if (tableBucketInfo.isPresent()) {
        return hiveSplitSource
                .addToQueue(getBucketedSplits(path, fs, splitFactory, tableBucketInfo.get(), bucketConversion));
    }

    // S3 Select pushdown works at the granularity of individual S3 objects,
    // therefore we must not split files when it is enabled.
    boolean splittable = getHeaderCount(schema) == 0 && getFooterCount(schema) == 0 && !s3SelectPushdownEnabled;

    fileIterators.addLast(createInternalHiveSplitIterator(path, fs, splitFactory, splittable));
    return COMPLETED_FUTURE;
}
From source file: it.crs4.pydoop.pipes.Submitter.java
License: Apache License
private static void setupPipesJob(JobConf conf) throws IOException {
    // default map output types to Text
    if (!getIsJavaMapper(conf)) {
        conf.setMapRunnerClass(PipesMapRunner.class);
        // Save the user's partitioner and hook in ours.
        setJavaPartitioner(conf, conf.getPartitionerClass());
        conf.setPartitionerClass(PipesPartitioner.class);
    }
    if (!getIsJavaReducer(conf)) {
        conf.setReducerClass(PipesReducer.class);
        if (!getIsJavaRecordWriter(conf)) {
            conf.setOutputFormat(NullOutputFormat.class);
        }
    }

    String textClassname = Text.class.getName();
    setIfUnset(conf, MRJobConfig.MAP_OUTPUT_KEY_CLASS, textClassname);
    setIfUnset(conf, MRJobConfig.MAP_OUTPUT_VALUE_CLASS, textClassname);
    setIfUnset(conf, MRJobConfig.OUTPUT_KEY_CLASS, textClassname);
    setIfUnset(conf, MRJobConfig.OUTPUT_VALUE_CLASS, textClassname);

    // Use PipesNonJavaInputFormat if necessary to handle progress reporting
    // from C++ RecordReaders ...
    if (!getIsJavaRecordReader(conf) && !getIsJavaMapper(conf)) {
        conf.setClass(Submitter.INPUT_FORMAT, conf.getInputFormat().getClass(), InputFormat.class);
        conf.setInputFormat(PipesNonJavaInputFormat.class);
    }

    String exec = getExecutable(conf);
    if (exec == null) {
        throw new IllegalArgumentException("No application program defined.");
    }
    // add default debug script only when executable is expressed as
    // <path>#<executable>
    if (exec.contains("#")) {
        // set default gdb commands for map and reduce task
        String defScript = "$HADOOP_PREFIX/src/c++/pipes/debug/pipes-default-script";
        setIfUnset(conf, MRJobConfig.MAP_DEBUG_SCRIPT, defScript);
        setIfUnset(conf, MRJobConfig.REDUCE_DEBUG_SCRIPT, defScript);
    }

    URI[] fileCache = DistributedCache.getCacheFiles(conf);
    if (fileCache == null) {
        fileCache = new URI[1];
    } else {
        URI[] tmp = new URI[fileCache.length + 1];
        System.arraycopy(fileCache, 0, tmp, 1, fileCache.length);
        fileCache = tmp;
    }
    try {
        fileCache[0] = new URI(exec);
    } catch (URISyntaxException e) {
        IOException ie = new IOException("Problem parsing execable URI " + exec);
        ie.initCause(e);
        throw ie;
    }
    DistributedCache.setCacheFiles(fileCache, conf);
}
From source file: it.crs4.pydoop.pipes.Submitter.java
License: Apache License
@Override
public int run(String[] args) throws Exception {
    CommandLineParser cli = new CommandLineParser();
    if (args.length == 0) {
        cli.printUsage();
        return 1;
    }
    cli.addOption("input", false, "input path to the maps", "path");
    cli.addOption("output", false, "output path from the reduces", "path");

    cli.addOption("jar", false, "job jar file", "path");
    cli.addOption("inputformat", false, "java classname of InputFormat", "class");
    //cli.addArgument("javareader", false, "is the RecordReader in Java");
    cli.addOption("map", false, "java classname of Mapper", "class");
    cli.addOption("partitioner", false, "java classname of Partitioner", "class");
    cli.addOption("reduce", false, "java classname of Reducer", "class");
    cli.addOption("writer", false, "java classname of OutputFormat", "class");
    cli.addOption("program", false, "URI to application executable", "class");
    cli.addOption("reduces", false, "number of reduces", "num");
    cli.addOption("jobconf", false,
            "\"n1=v1,n2=v2,..\" (Deprecated) Optional. Add or override a JobConf property.", "key=val");
    cli.addOption("lazyOutput", false, "Optional. Create output lazily", "boolean");
    Parser parser = cli.createParser();
    try {
        GenericOptionsParser genericParser = new GenericOptionsParser(getConf(), args);
        CommandLine results = parser.parse(cli.options, genericParser.getRemainingArgs());

        JobConf job = new JobConf(getConf());

        if (results.hasOption("input")) {
            FileInputFormat.setInputPaths(job, results.getOptionValue("input"));
        }
        if (results.hasOption("output")) {
            FileOutputFormat.setOutputPath(job, new Path(results.getOptionValue("output")));
        }
        if (results.hasOption("jar")) {
            job.setJar(results.getOptionValue("jar"));
        }
        if (results.hasOption("inputformat")) {
            setIsJavaRecordReader(job, true);
            job.setInputFormat(getClass(results, "inputformat", job, InputFormat.class));
        }
        if (results.hasOption("javareader")) {
            setIsJavaRecordReader(job, true);
        }
        if (results.hasOption("map")) {
            setIsJavaMapper(job, true);
            job.setMapperClass(getClass(results, "map", job, Mapper.class));
        }
        if (results.hasOption("partitioner")) {
            job.setPartitionerClass(getClass(results, "partitioner", job, Partitioner.class));
        }
        if (results.hasOption("reduce")) {
            setIsJavaReducer(job, true);
            job.setReducerClass(getClass(results, "reduce", job, Reducer.class));
        }
        if (results.hasOption("reduces")) {
            job.setNumReduceTasks(Integer.parseInt(results.getOptionValue("reduces")));
        }
        if (results.hasOption("writer")) {
            setIsJavaRecordWriter(job, true);
            job.setOutputFormat(getClass(results, "writer", job, OutputFormat.class));
        }
        if (results.hasOption("lazyOutput")) {
            if (Boolean.parseBoolean(results.getOptionValue("lazyOutput"))) {
                LazyOutputFormat.setOutputFormatClass(job, job.getOutputFormat().getClass());
            }
        }
        if (results.hasOption("program")) {
            setExecutable(job, results.getOptionValue("program"));
        }
        if (results.hasOption("jobconf")) {
            LOG.warn("-jobconf option is deprecated, please use -D instead.");
            String options = results.getOptionValue("jobconf");
            StringTokenizer tokenizer = new StringTokenizer(options, ",");
            while (tokenizer.hasMoreTokens()) {
                String keyVal = tokenizer.nextToken().trim();
                String[] keyValSplit = keyVal.split("=");
                job.set(keyValSplit[0], keyValSplit[1]);
            }
        }
        // if they gave us a jar file, include it into the class path
        String jarFile = job.getJar();
        if (jarFile != null) {
            final URL[] urls = new URL[] { FileSystem.getLocal(job).pathToFile(new Path(jarFile)).toURL() };
            // FindBugs complains that creating a URLClassLoader should be
            // in a doPrivileged() block.
            ClassLoader loader = AccessController.doPrivileged(new PrivilegedAction<ClassLoader>() {
                public ClassLoader run() {
                    return new URLClassLoader(urls);
                }
            });
            job.setClassLoader(loader);
        }

        runJob(job);
        return 0;
    } catch (ParseException pe) {
        LOG.info("Error : " + pe);
        cli.printUsage();
        return 1;
    }
}
From source file: ivory.core.index.MergeGlobalStatsAcrossIndexSegments.java
License: Apache License
public int runTool() throws Exception {
    JobConf conf = new JobConf(getConf(), MergeGlobalStatsAcrossIndexSegments.class);
    FileSystem fs = FileSystem.get(conf);

    String collectionName = conf.get("Ivory.CollectionName");
    String indexPaths = conf.get("Ivory.IndexPaths");
    String dataOutputPath = conf.get("Ivory.DataOutputPath");
    int dfThreshold = conf.getInt("Ivory.DfThreshold", 0);

    // first, compute size of global term space
    Path tmpPaths = new Path("/tmp/index-paths.txt");
    FSDataOutputStream out = fs.create(tmpPaths, true);
    for (String s : indexPaths.split(",")) {
        out.write(new String(s + "\n").getBytes());
    }
    out.close();

    LOG.info("Job: ComputeNumberOfTermsAcrossIndexSegments");
    conf.setJobName("ComputeNumberOfTermsAcrossIndexSegments:" + collectionName);

    FileInputFormat.addInputPath(conf, tmpPaths);

    conf.setNumMapTasks(1);
    conf.setNumReduceTasks(1);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    conf.setInputFormat(NLineInputFormat.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(PairOfIntLong.class);
    conf.setOutputFormat(NullOutputFormat.class);
    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(IdentityReducer.class);

    long startTime = System.currentTimeMillis();
    RunningJob job = JobClient.runJob(conf);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    Counters counters = job.getCounters();
    long totalNumTerms = counters
            .findCounter("org.apache.hadoop.mapred.Task$Counter", 6, "REDUCE_INPUT_GROUPS").getCounter();
    LOG.info("total number of terms in global dictionary = " + totalNumTerms);

    // now build the dictionary
    fs.delete(new Path(dataOutputPath), true);

    conf = new JobConf(getConf(), MergeGlobalStatsAcrossIndexSegments.class);

    LOG.info("Job: MergeGlobalStatsAcrossIndexSegments");
    conf.setJobName("MergeGlobalStatsAcrossIndexSegments:" + collectionName);

    FileInputFormat.addInputPath(conf, tmpPaths);

    conf.setNumMapTasks(1);
    conf.setNumReduceTasks(1);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    conf.setInputFormat(NLineInputFormat.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(PairOfIntLong.class);
    conf.setOutputFormat(NullOutputFormat.class);
    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(MyReducer.class);

    conf.setLong("Ivory.IndexNumberOfTerms", (int) totalNumTerms);

    startTime = System.currentTimeMillis();
    job = JobClient.runJob(conf);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    // compute some # docs, collection length, avg doc length
    long collectionLength = 0;
    int docCount = 0;
    for (String index : indexPaths.split(",")) {
        LOG.info("reading stats for " + index);

        RetrievalEnvironment env = new RetrievalEnvironment(index, fs);
        long l = env.readCollectionLength();
        int n = env.readCollectionDocumentCount();
        LOG.info(" - CollectionLength: " + l);
        LOG.info(" - CollectionDocumentCount: " + n);

        collectionLength += l;
        docCount += n;
    }

    float avgdl = (float) collectionLength / docCount;

    LOG.info("all index segments: ");
    LOG.info(" - CollectionLength: " + collectionLength);
    LOG.info(" - CollectionDocumentCount: " + docCount);
    LOG.info(" - AverageDocumentLenght: " + avgdl);

    RetrievalEnvironment env = new RetrievalEnvironment(dataOutputPath, fs);
    env.writeCollectionAverageDocumentLength(avgdl);
    env.writeCollectionLength(collectionLength);
    env.writeCollectionDocumentCount(docCount);

    return 0;
}
From source file: ivory.core.preprocess.BuildTargetLangWeightedIntDocVectors.java
License: Apache License
@SuppressWarnings("deprecation") public int runTool() throws Exception { // sLogger.setLevel(Level.DEBUG); sLogger.info("PowerTool: GetTargetLangWeightedIntDocVectors"); JobConf conf = new JobConf(BuildTargetLangWeightedIntDocVectors.class); FileSystem fs = FileSystem.get(conf); String indexPath = getConf().get("Ivory.IndexPath"); RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs); String outputPath = env.getWeightedIntDocVectorsDirectory(); int mapTasks = getConf().getInt("Ivory.NumMapTasks", 0); int minSplitSize = getConf().getInt("Ivory.MinSplitSize", 0); String collectionName = getConf().get("Ivory.CollectionName"); sLogger.info("Characteristics of the collection:"); sLogger.info(" - CollectionName: " + collectionName); sLogger.info("Characteristics of the job:"); sLogger.info(" - NumMapTasks: " + mapTasks); sLogger.info(" - MinSplitSize: " + minSplitSize); String vocabFile = getConf().get("Ivory.FinalVocab"); DistributedCache.addCacheFile(new URI(vocabFile), conf); Path inputPath = new Path(PwsimEnvironment.getFileNameWithPars(indexPath, "TermDocs")); Path weightedVectorsPath = new Path(outputPath); if (fs.exists(weightedVectorsPath)) { sLogger.info("Output path already exists!"); return -1; }//www . ja va 2s.co m conf.setJobName("GetWeightedIntDocVectors:" + collectionName); conf.setNumMapTasks(mapTasks); conf.setNumReduceTasks(0); conf.setInt("mapred.min.split.size", minSplitSize); conf.set("mapred.child.java.opts", "-Xmx2048m"); conf.setBoolean("Ivory.Normalize", getConf().getBoolean("Ivory.Normalize", false)); FileInputFormat.setInputPaths(conf, inputPath); FileOutputFormat.setOutputPath(conf, weightedVectorsPath); conf.setInputFormat(SequenceFileInputFormat.class); conf.setMapOutputKeyClass(IntWritable.class); conf.setMapOutputValueClass(WeightedIntDocVector.class); conf.setOutputFormat(SequenceFileOutputFormat.class); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(WeightedIntDocVector.class); conf.setMapperClass(MyMapper.class); long startTime = System.currentTimeMillis(); RunningJob rj = JobClient.runJob(conf); sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); Counters counters = rj.getCounters(); long numOfDocs = (long) counters.findCounter(Docs.Total).getCounter(); return (int) numOfDocs; }
From source file: ivory.core.preprocess.BuildWeightedIntDocVectors.java
License: Apache License
@SuppressWarnings("deprecation") public int runTool() throws Exception { sLogger.setLevel(Level.WARN); sLogger.info("PowerTool: GetWeightedIntDocVectors"); // create a new JobConf, inheriting from the configuration of this // PowerTool/*from w w w . jav a 2 s .c o m*/ JobConf conf = new JobConf(getConf(), BuildWeightedIntDocVectors.class); FileSystem fs = FileSystem.get(conf); String indexPath = conf.get("Ivory.IndexPath"); RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs); String outputPath = env.getWeightedIntDocVectorsDirectory(); int mapTasks = conf.getInt("Ivory.NumMapTasks", 0); int minSplitSize = conf.getInt("Ivory.MinSplitSize", 0); String collectionName = conf.get("Ivory.CollectionName"); sLogger.info("Characteristics of the collection:"); sLogger.info(" - CollectionName: " + collectionName); sLogger.info("Characteristics of the job:"); sLogger.info(" - NumMapTasks: " + mapTasks); sLogger.info(" - MinSplitSize: " + minSplitSize); String dfByIntFilePath = env.getDfByIntData(); String cfByIntFilePath = env.getCfByIntData(); /* add df table to cache */ if (!fs.exists(new Path(dfByIntFilePath))) { throw new RuntimeException("Error, df data file " + dfByIntFilePath + "doesn't exist!"); } DistributedCache.addCacheFile(new URI(dfByIntFilePath), conf); /* add cf table to cache */ if (!fs.exists(new Path(cfByIntFilePath))) { throw new RuntimeException("Error, cf data file " + cfByIntFilePath + "doesn't exist!"); } DistributedCache.addCacheFile(new URI(cfByIntFilePath), conf); /* add dl table to cache */ Path docLengthFile = env.getDoclengthsData(); if (!fs.exists(docLengthFile)) { throw new RuntimeException("Error, doc-length data file " + docLengthFile + "doesn't exist!"); } DistributedCache.addCacheFile(docLengthFile.toUri(), conf); Path inputPath = new Path(env.getIntDocVectorsDirectory()); Path weightedVectorsPath = new Path(outputPath); if (fs.exists(weightedVectorsPath)) { sLogger.info("Output path already exists!"); return 0; } //fs.delete(weightedVectirsPath, true); conf.setJobName("GetWeightedIntDocVectors:" + collectionName); conf.setNumMapTasks(mapTasks); conf.setNumReduceTasks(0); conf.setInt("mapred.min.split.size", minSplitSize); conf.set("mapred.child.java.opts", "-Xmx2048m"); FileInputFormat.setInputPaths(conf, inputPath); FileOutputFormat.setOutputPath(conf, weightedVectorsPath); conf.setInputFormat(SequenceFileInputFormat.class); conf.setMapOutputKeyClass(IntWritable.class); conf.setMapOutputValueClass(WeightedIntDocVector.class); conf.setOutputFormat(SequenceFileOutputFormat.class); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(WeightedIntDocVector.class); conf.setMapperClass(MyMapper.class); //conf.setInt("mapred.task.timeout",3600000); long startTime = System.currentTimeMillis(); RunningJob job = JobClient.runJob(conf); sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); return 0; }
From source file: ivory.core.preprocess.BuildWeightedTermDocVectors.java
License: Apache License
@SuppressWarnings("deprecation") public int runTool() throws Exception { sLogger.info("PowerTool: GetWeightedTermDocVectors"); JobConf conf = new JobConf(BuildWeightedTermDocVectors.class); FileSystem fs = FileSystem.get(conf); String indexPath = getConf().get("Ivory.IndexPath"); RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs); String outputPath = env.getWeightedTermDocVectorsDirectory(); int mapTasks = getConf().getInt("Ivory.NumMapTasks", 0); int minSplitSize = getConf().getInt("Ivory.MinSplitSize", 0); String collectionName = getConf().get("Ivory.CollectionName"); String termsFilePath = env.getIndexTermsData(); String termsIdsFilePath = env.getIndexTermIdsData(); String termIdMappingFilePath = env.getIndexTermIdMappingData(); String dfByTermFilePath = env.getDfByTermData(); Path inputPath = new Path(env.getTermDocVectorsDirectory()); Path weightedVectorsPath = new Path(outputPath); if (fs.exists(weightedVectorsPath)) { //fs.delete(weightedVectorsPath, true); sLogger.info("Output path already exists!"); return 0; }// w w w . ja v a 2 s . com /* add terms file to cache */ if (!fs.exists(new Path(termsFilePath)) || !fs.exists(new Path(termsIdsFilePath)) || !fs.exists(new Path(termIdMappingFilePath))) { throw new RuntimeException("Error, terms file " + termsFilePath + "/" + termsIdsFilePath + "/" + termIdMappingFilePath + "doesn't exist!"); } DistributedCache.addCacheFile(new URI(termsFilePath), conf); DistributedCache.addCacheFile(new URI(termsIdsFilePath), conf); DistributedCache.addCacheFile(new URI(termIdMappingFilePath), conf); /* add df table to cache */ if (!fs.exists(new Path(dfByTermFilePath))) { throw new RuntimeException("Error, df data file " + dfByTermFilePath + "doesn't exist!"); } DistributedCache.addCacheFile(new URI(dfByTermFilePath), conf); /* add dl table to cache */ Path docLengthFile = env.getDoclengthsData(); if (!fs.exists(docLengthFile)) { throw new RuntimeException("Error, doc-length data file " + docLengthFile + "doesn't exist!"); } DistributedCache.addCacheFile(docLengthFile.toUri(), conf); conf.setMapperClass(MyMapper.class); //conf.setInt("mapred.task.timeout",3600000); conf.setJobName("GetWeightedTermDocVectors:" + collectionName); conf.setNumMapTasks(mapTasks); conf.setNumReduceTasks(0); conf.setInt("mapred.min.split.size", minSplitSize); conf.set("mapred.child.java.opts", "-Xmx2048m"); conf.setInt("Ivory.MinNumTerms", getConf().getInt("Ivory.MinNumTerms", Integer.MAX_VALUE)); conf.setBoolean("Ivory.Normalize", getConf().getBoolean("Ivory.Normalize", false)); if (getConf().get("Ivory.ShortDocLengths") != null) { conf.set("Ivory.ShortDocLengths", getConf().get("Ivory.ShortDocLengths")); } conf.set("Ivory.ScoringModel", getConf().get("Ivory.ScoringModel")); FileInputFormat.setInputPaths(conf, inputPath); FileOutputFormat.setOutputPath(conf, weightedVectorsPath); conf.setInputFormat(SequenceFileInputFormat.class); conf.setMapOutputKeyClass(IntWritable.class); conf.setMapOutputValueClass(HMapSFW.class); conf.setOutputFormat(SequenceFileOutputFormat.class); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(HMapSFW.class); sLogger.info("Running job: " + conf.getJobName()); long startTime = System.currentTimeMillis(); RunningJob job = JobClient.runJob(conf); sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); return 0; }
From source file: ivory.index.BuildIntPostingsForwardIndex.java
License: Apache License
public int runTool() throws Exception {
    JobConf conf = new JobConf(getConf(), BuildIntPostingsForwardIndex.class);
    FileSystem fs = FileSystem.get(conf);

    int mapTasks = conf.getInt("Ivory.NumMapTasks", 0);
    int minSplitSize = conf.getInt("Ivory.MinSplitSize", 0);
    String indexPath = conf.get("Ivory.IndexPath");

    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    String collectionName = env.readCollectionName();

    sLogger.info("Tool: BuildIntPostingsForwardIndex");
    sLogger.info(" - IndexPath: " + indexPath);
    sLogger.info(" - CollectionName: " + collectionName);

    conf.setJobName("BuildIntPostingsForwardIndex:" + collectionName);

    Path inputPath = new Path(env.getPostingsDirectory());
    FileInputFormat.setInputPaths(conf, inputPath);

    Path postingsIndexPath = new Path(env.getPostingsIndexData());

    if (fs.exists(postingsIndexPath)) {
        sLogger.info("Postings forward index path already exists!");
        return 0;
    }
    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(1);

    conf.setInt("mapred.min.split.size", minSplitSize);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setOutputFormat(NullOutputFormat.class);

    conf.setMapRunnerClass(MyMapRunner.class);
    conf.setReducerClass(MyReducer.class);

    JobClient.runJob(conf);

    return 0;
}