List of usage examples for org.apache.hadoop.mapred.JobConf.set
public void set(String name, String value)
Set the value of the name property.
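A minimal standalone sketch of the call, assuming nothing beyond the JobConf API itself; the property name and value are made up for illustration:

    import org.apache.hadoop.mapred.JobConf;

    public class JobConfSetExample {
        public static void main(String[] args) {
            JobConf conf = new JobConf(false); // skip loading default resources
            // set(name, value) stores an arbitrary string property in the job configuration
            conf.set("example.property.name", "example-value"); // hypothetical key
            // The matching getter reads it back (returns null if the key was never set).
            System.out.println(conf.get("example.property.name"));
        }
    }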
From source file:io.prestosql.rcfile.RcFileTester.java
License:Apache License
private static <K extends LongWritable, V extends BytesRefArrayWritable> void assertFileContentsOld(
        Type type,
        TempFile tempFile,
        Format format,
        Iterable<?> expectedValues)
        throws Exception
{
    JobConf configuration = new JobConf(new Configuration(false));
    configuration.set(READ_COLUMN_IDS_CONF_STR, "0");
    configuration.setBoolean(READ_ALL_COLUMNS, false);

    Properties schema = new Properties();
    schema.setProperty(META_TABLE_COLUMNS, "test");
    schema.setProperty(META_TABLE_COLUMN_TYPES, getJavaObjectInspector(type).getTypeName());

    Deserializer deserializer;
    if (format == Format.BINARY) {
        deserializer = new LazyBinaryColumnarSerDe();
    }
    else {
        deserializer = new ColumnarSerDe();
    }
    deserializer.initialize(configuration, schema);
    configuration.set(SERIALIZATION_LIB, deserializer.getClass().getName());

    InputFormat<K, V> inputFormat = new RCFileInputFormat<>();
    RecordReader<K, V> recordReader = inputFormat.getRecordReader(
            new FileSplit(new Path(tempFile.getFile().getAbsolutePath()), 0, tempFile.getFile().length(), (String[]) null),
            configuration,
            NULL);

    K key = recordReader.createKey();
    V value = recordReader.createValue();

    StructObjectInspector rowInspector = (StructObjectInspector) deserializer.getObjectInspector();
    StructField field = rowInspector.getStructFieldRef("test");

    Iterator<?> iterator = expectedValues.iterator();
    while (recordReader.next(key, value)) {
        Object expectedValue = iterator.next();

        Object rowData = deserializer.deserialize(value);
        Object actualValue = rowInspector.getStructFieldData(rowData, field);
        actualValue = decodeRecordReaderValue(type, actualValue);
        assertColumnValueEquals(type, actualValue, expectedValue);
    }
    assertFalse(iterator.hasNext());
}
From source file:io.prestosql.rcfile.RcFileTester.java
License:Apache License
private static RecordWriter createRcFileWriterOld(File outputFile, Compression compression,
        ObjectInspector columnObjectInspector)
        throws IOException
{
    JobConf jobConf = new JobConf(false);
    Optional<String> codecName = compression.getCodecName();
    codecName.ifPresent(s -> jobConf.set(COMPRESS_CODEC, s));

    return new RCFileOutputFormat().getHiveRecordWriter(
            jobConf,
            new Path(outputFile.toURI()),
            Text.class,
            codecName.isPresent(),
            createTableProperties("test", columnObjectInspector.getTypeName()),
            () -> { });
}
From source file:it.crs4.pydoop.pipes.Submitter.java
License:Apache License
/**
 * Set the URI for the application's executable. Normally this is a hdfs:
 * location.
 * @param conf the configuration to modify
 * @param executable The URI of the application's executable.
 */
public static void setExecutable(JobConf conf, String executable) {
    conf.set(Submitter.EXECUTABLE, executable);
}
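A hedged usage sketch of the helper above; the HDFS URI is invented for illustration:

    JobConf conf = new JobConf();
    // Point the pipes framework at the application binary stored on HDFS
    // (hypothetical path, chosen only for the example).
    Submitter.setExecutable(conf, "hdfs://namenode:8020/apps/my-pipes-binary");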
From source file:it.crs4.pydoop.pipes.Submitter.java
License:Apache License
/**
 * Set the configuration, if it doesn't already have a value for the given
 * key.
 * @param conf the configuration to modify
 * @param key the key to set
 * @param value the new "default" value to set
 */
private static void setIfUnset(JobConf conf, String key, String value) {
    if (conf.get(key) == null) {
        conf.set(key, value);
    }
}
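A sketch of the resulting precedence, shown as if called from inside Submitter (the key names are hypothetical): an explicitly configured value wins, and the "default" only fills in when the key is absent:

    JobConf conf = new JobConf();
    conf.set("pipes.example.key", "user-supplied");
    setIfUnset(conf, "pipes.example.key", "fallback"); // no effect: key already set
    setIfUnset(conf, "pipes.other.key", "fallback");   // takes effect: key was unset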
From source file:it.crs4.pydoop.pipes.Submitter.java
License:Apache License
/**
 * Save away the user's original partitioner before we override it.
 * @param conf the configuration to modify
 * @param cls the user's partitioner class
 */
static void setJavaPartitioner(JobConf conf, Class cls) {
    conf.set(Submitter.PARTITIONER, cls.getName());
}
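For context, the retrieval side of this save-then-override pattern presumably mirrors Hadoop's own pipes Submitter; a sketch under that assumption (whether this fork keeps the same method and default is not confirmed by the source here):

    static Class<? extends Partitioner> getJavaPartitioner(JobConf conf) {
        // Read back the class name saved under Submitter.PARTITIONER;
        // HashPartitioner as the default is an assumption borrowed from Hadoop.
        return conf.getClass(Submitter.PARTITIONER, HashPartitioner.class, Partitioner.class);
    }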
From source file:it.crs4.pydoop.pipes.Submitter.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    CommandLineParser cli = new CommandLineParser();
    if (args.length == 0) {
        cli.printUsage();
        return 1;
    }
    cli.addOption("input", false, "input path to the maps", "path");
    cli.addOption("output", false, "output path from the reduces", "path");
    cli.addOption("jar", false, "job jar file", "path");
    cli.addOption("inputformat", false, "java classname of InputFormat", "class");
    //cli.addArgument("javareader", false, "is the RecordReader in Java");
    cli.addOption("map", false, "java classname of Mapper", "class");
    cli.addOption("partitioner", false, "java classname of Partitioner", "class");
    cli.addOption("reduce", false, "java classname of Reducer", "class");
    cli.addOption("writer", false, "java classname of OutputFormat", "class");
    cli.addOption("program", false, "URI to application executable", "class");
    cli.addOption("reduces", false, "number of reduces", "num");
    cli.addOption("jobconf", false,
            "\"n1=v1,n2=v2,..\" (Deprecated) Optional. Add or override a JobConf property.", "key=val");
    cli.addOption("lazyOutput", false, "Optional. Create output lazily", "boolean");
    Parser parser = cli.createParser();
    try {
        GenericOptionsParser genericParser = new GenericOptionsParser(getConf(), args);
        CommandLine results = parser.parse(cli.options, genericParser.getRemainingArgs());

        JobConf job = new JobConf(getConf());

        if (results.hasOption("input")) {
            FileInputFormat.setInputPaths(job, results.getOptionValue("input"));
        }
        if (results.hasOption("output")) {
            FileOutputFormat.setOutputPath(job, new Path(results.getOptionValue("output")));
        }
        if (results.hasOption("jar")) {
            job.setJar(results.getOptionValue("jar"));
        }
        if (results.hasOption("inputformat")) {
            setIsJavaRecordReader(job, true);
            job.setInputFormat(getClass(results, "inputformat", job, InputFormat.class));
        }
        if (results.hasOption("javareader")) {
            setIsJavaRecordReader(job, true);
        }
        if (results.hasOption("map")) {
            setIsJavaMapper(job, true);
            job.setMapperClass(getClass(results, "map", job, Mapper.class));
        }
        if (results.hasOption("partitioner")) {
            job.setPartitionerClass(getClass(results, "partitioner", job, Partitioner.class));
        }
        if (results.hasOption("reduce")) {
            setIsJavaReducer(job, true);
            job.setReducerClass(getClass(results, "reduce", job, Reducer.class));
        }
        if (results.hasOption("reduces")) {
            job.setNumReduceTasks(Integer.parseInt(results.getOptionValue("reduces")));
        }
        if (results.hasOption("writer")) {
            setIsJavaRecordWriter(job, true);
            job.setOutputFormat(getClass(results, "writer", job, OutputFormat.class));
        }
        if (results.hasOption("lazyOutput")) {
            if (Boolean.parseBoolean(results.getOptionValue("lazyOutput"))) {
                LazyOutputFormat.setOutputFormatClass(job, job.getOutputFormat().getClass());
            }
        }
        if (results.hasOption("program")) {
            setExecutable(job, results.getOptionValue("program"));
        }
        if (results.hasOption("jobconf")) {
            LOG.warn("-jobconf option is deprecated, please use -D instead.");
            String options = results.getOptionValue("jobconf");
            StringTokenizer tokenizer = new StringTokenizer(options, ",");
            while (tokenizer.hasMoreTokens()) {
                String keyVal = tokenizer.nextToken().trim();
                String[] keyValSplit = keyVal.split("=");
                job.set(keyValSplit[0], keyValSplit[1]);
            }
        }
        // if they gave us a jar file, include it into the class path
        String jarFile = job.getJar();
        if (jarFile != null) {
            final URL[] urls = new URL[] { FileSystem.getLocal(job).pathToFile(new Path(jarFile)).toURL() };
            // FindBugs complains that creating a URLClassLoader should be
            // in a doPrivileged() block.
            ClassLoader loader = AccessController.doPrivileged(new PrivilegedAction<ClassLoader>() {
                public ClassLoader run() {
                    return new URLClassLoader(urls);
                }
            });
            job.setClassLoader(loader);
        }
        runJob(job);
        return 0;
    } catch (ParseException pe) {
        LOG.info("Error : " + pe);
        cli.printUsage();
        return 1;
    }
}
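One caveat in the -jobconf handling above: keyVal.split("=") splits on every '=', so a value that itself contains '=' is silently truncated. A limit-2 split preserves the whole value; a sketch of the safer line (not part of the original source):

    String[] keyValSplit = keyVal.split("=", 2);
    job.set(keyValSplit[0], keyValSplit[1]);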
From source file:it.isislab.sof.core.engine.hadoop.sshclient.utils.simulation.executor.SOF.java
License:Apache License
public static void main(String[] args) {
    /*
     * Example invocation:
     * aids /home/michele/Scrivania/aids netlogo /home/michele/Scrivania/aids/aids.nlogo
     * /home/michele/Scrivania/aids/input.tmp /home/michele/Scrivania/aids/output
     * /home/michele/Scrivania/aids/output.xml false pepp ciao
     */
    /*
    try {
        //Runtime.getRuntime().exec("rm -r /home/lizard87/Desktop/mason_test/output");
        Runtime.getRuntime().exec("rm -r /home/michele/Scrivania/aids/output");
    } catch (IOException e) { e.printStackTrace(); }
    */

    if (args.length < 9 || args.length == 11 || args.length == 12 || args.length >= 15) {
        System.out.println("Usage:");
        System.out.println("java -jar SCUD.jar "
                + "<simulation_name> "
                + "<simulation_path_home> "
                + "<simulation_type[mason |netlogo |generic]>"
                + "<simulation_generic_interpreter_path>"
                + "<simultion_program_path> "
                + "<simulation_mapper_input_path> "
                + "<simulation_mapper_output_path> "
                + "<simulation_output_domain_xmlfile> "
                + "<simulation_input_path> "
                + "<<simulation_rating_path>>"
                + "<oneshot[one|loop]> "
                + "<author_name> "
                + "<simulation_description> "
                + "<path_interpreter_evaluate_file> "
                + "<evaluate_file_path>");
        System.exit(-1);
    }

    Configuration conf = null;
    JobConf job = null;
    String AUTHOR = null; /* author name */
    String SIMULATION_NAME = null; /* simulation name */
    String SIMULATION_HOME = null; /* simulation path */
    String SIM_TYPE = null; /* mason, netlogo, generic */
    String SIM_EXECUTABLE_SIMULATION_INTERPRETER_PATH = null;
    String SIM_EXECUTABLE_SIMULATION_PROGRAM = null; /* executable program *.jar | *.nlogo */
    String SIM_EXECUTION_INPUT_DATA_MAPPER = null; /* input.data path */
    String SIM_EXECUTION_OUTPUT_MAPPER = null; /* output loop(i) path */
    String SIM_DESCRIPTION_OUTPUT_XML_DOMAIN = null; /* path of domain file */
    String SIM_EXECUTION_INPUT_XML = null; /* execution input path */
    boolean ISLOOP = false; /* false[one] | true[loop] */
    //String DESCRIPTION = null; /* simulations' description */
    String INTERPRETER_REMOTE_PATH_EVALUATION = null; /* remote program bin path for executing EvalFoo */
    String EXECUTABLE_RATING_FILE = null; /* path of rating file */
    String SIM_RATING_PATH = null;

    // aids /home/michele/Scrivania/aids netlogo /home/michele/Scrivania/aids/aids.nlogo
    // /home/michele/Scrivania/aids/input.tmp /home/michele/Scrivania/aids/output
    // /home/michele/Scrivania/aids/domain.xml /home/michele/Scrivania/aids/input loop pepp ciao
    // /usr/bin/python /home/michele/Scrivania/aids/evaluate.py

    if (args.length == 13) {
        SIMULATION_NAME = args[0];
        SIMULATION_HOME = args[1];
        SIM_TYPE = args[2];
        SIM_EXECUTABLE_SIMULATION_PROGRAM = args[3];
        SIM_EXECUTION_INPUT_DATA_MAPPER = args[4];
        SIM_EXECUTION_OUTPUT_MAPPER = args[5];
        SIM_DESCRIPTION_OUTPUT_XML_DOMAIN = args[6];
        SIM_EXECUTION_INPUT_XML = args[7];
        SIM_RATING_PATH = args[8];
        ISLOOP = Boolean.parseBoolean(args[9]);
        AUTHOR = args[10];
        //DESCRIPTION = args[11];
        INTERPRETER_REMOTE_PATH_EVALUATION = args[11];
        EXECUTABLE_RATING_FILE = args[12];
        //System.out.println(DESCRIPTION);
        //System.out.println(INTERPRETER_REMOTE_PATH_EVALUATION);
    } else if (args.length == 9) {
        SIMULATION_NAME = args[0];
        SIMULATION_HOME = args[1];
        SIM_TYPE = args[2];
        SIM_EXECUTABLE_SIMULATION_PROGRAM = args[3];
        SIM_EXECUTION_INPUT_DATA_MAPPER = args[4];
        SIM_EXECUTION_OUTPUT_MAPPER = args[5];
        SIM_DESCRIPTION_OUTPUT_XML_DOMAIN = args[6];
        ISLOOP = Boolean.parseBoolean(args[7]);
        AUTHOR = args[8];
        //DESCRIPTION = args[9];
    } else if (args.length == 14) {
        SIMULATION_NAME = args[0];
        SIMULATION_HOME = args[1];
        SIM_TYPE = args[2];
        SIM_EXECUTABLE_SIMULATION_INTERPRETER_PATH = args[3];
        SIM_EXECUTABLE_SIMULATION_PROGRAM = args[4];
        SIM_EXECUTION_INPUT_DATA_MAPPER = args[5];
        SIM_EXECUTION_OUTPUT_MAPPER = args[6];
        SIM_DESCRIPTION_OUTPUT_XML_DOMAIN = args[7];
        SIM_EXECUTION_INPUT_XML = args[8];
        SIM_RATING_PATH = args[9];
        ISLOOP = Boolean.parseBoolean(args[10]);
        AUTHOR = args[11];
        //DESCRIPTION = args[12];
        INTERPRETER_REMOTE_PATH_EVALUATION = args[12];
        EXECUTABLE_RATING_FILE = args[13];
    } else if (args.length == 10) {
        SIMULATION_NAME = args[0];
        SIMULATION_HOME = args[1];
        SIM_TYPE = args[2];
        SIM_EXECUTABLE_SIMULATION_INTERPRETER_PATH = args[3];
        SIM_EXECUTABLE_SIMULATION_PROGRAM = args[4];
        SIM_EXECUTION_INPUT_DATA_MAPPER = args[5];
        SIM_EXECUTION_OUTPUT_MAPPER = args[6];
        SIM_DESCRIPTION_OUTPUT_XML_DOMAIN = args[7];
        ISLOOP = Boolean.parseBoolean(args[8]);
        AUTHOR = args[9];
        //DESCRIPTION = args[10];
    }

    if (!(SIM_TYPE.equalsIgnoreCase("mason") || SIM_TYPE.equalsIgnoreCase("netlogo")
            || SIM_TYPE.equalsIgnoreCase("generic"))) {
        System.exit(-2);
    }

    conf = new Configuration();
    job = new JobConf(conf, SOF.class);
    job.setJobName(SIMULATION_NAME /* SIMULATION NAME */);
    job.set("simulation.home", SIMULATION_HOME);
    job.set("simulation.name", SIMULATION_NAME);
    job.set("simulation.type", SIM_TYPE);
    if (SIM_TYPE.equalsIgnoreCase("generic")) {
        job.set("simulation.interpreter.genericsim", SIM_EXECUTABLE_SIMULATION_INTERPRETER_PATH);
    }
    job.set("simulation.program.simulation", SIM_EXECUTABLE_SIMULATION_PROGRAM);
    job.set("simulation.executable.input", SIM_EXECUTION_INPUT_DATA_MAPPER);
    job.set("simulation.executable.output", SIM_EXECUTION_OUTPUT_MAPPER);
    job.setBoolean("simulation.executable.mode", ISLOOP);
    //job.set("simulation.executable.mode", ISLOOP);
    job.set("simulation.executable.author", AUTHOR);
    //job.set("simulation.executable.description", DESCRIPTION);
    job.set("simulation.description.output.domain", SIM_DESCRIPTION_OUTPUT_XML_DOMAIN);

    /*
     * Generates the .tmp file; comment one of these lines back in for
     * local testing only. (Original comment was in Italian.)
     */
    //XmlToText.convertXmlFileToFileText(conf, "/home/lizard87/Desktop/mason_test/input.xml");
    //XmlToText.convertXmlFileToFileText(conf, "/home/lizard87/Desktop/input.xml");
    //XmlToText.convertXmlFileToFileText(conf, "/home/lizard87/Desktop/aids/input.xml");

    if (ISLOOP) {
        job.set("simulation.description.input", SIM_EXECUTION_INPUT_XML);
        job.set("simulation.program.rating", EXECUTABLE_RATING_FILE);
        //job.set("simulation.interpreter.selection", INTERPRETER_REMOTE_PATH_SELECTION);
        job.set("simulation.interpreter.rating", INTERPRETER_REMOTE_PATH_EVALUATION);
        job.set("simulation.executable.loop.rating", SIM_RATING_PATH);
    }

    FileInputFormat.addInputPath(job, new Path(SIM_EXECUTION_INPUT_DATA_MAPPER) /* INPUT DIRECTORY */);
    FileOutputFormat.setOutputPath(job, new Path(SIM_EXECUTION_OUTPUT_MAPPER));

    if (SIM_TYPE.equalsIgnoreCase("mason")) {
        job.setMapperClass(SOFMapperMason.class);
        job.setReducerClass(SOFReducerMason.class);
    } else if (SIM_TYPE.equalsIgnoreCase("netlogo")) {
        job.setMapperClass(SOFMapperNetLogo.class);
        job.setReducerClass(SOFReducerNetLogo.class);
    } else if (SIM_TYPE.equalsIgnoreCase("generic")) {
        job.setMapperClass(SOFMapperGeneric.class);
        job.setReducerClass(SOFReducerGeneric.class);
    }

    job.setOutputKeyClass(org.apache.hadoop.io.Text.class);
    job.setOutputValueClass(org.apache.hadoop.io.Text.class);

    JobClient jobc;
    try {
        jobc = new JobClient(job);
        System.out.println(jobc + " " + job);
        RunningJob runjob;
        runjob = JobClient.runJob(job);
        while (runjob.getJobStatus().equals(JobStatus.SUCCEEDED)) {
        }
        System.exit(0);
    } catch (IOException e) {
        e.printStackTrace();
    }
}
From source file:ivory.core.index.MergeGlobalStatsAcrossIndexSegments.java
License:Apache License
public int runTool() throws Exception {
    JobConf conf = new JobConf(getConf(), MergeGlobalStatsAcrossIndexSegments.class);
    FileSystem fs = FileSystem.get(conf);

    String collectionName = conf.get("Ivory.CollectionName");
    String indexPaths = conf.get("Ivory.IndexPaths");
    String dataOutputPath = conf.get("Ivory.DataOutputPath");
    int dfThreshold = conf.getInt("Ivory.DfThreshold", 0);

    // first, compute size of global term space
    Path tmpPaths = new Path("/tmp/index-paths.txt");
    FSDataOutputStream out = fs.create(tmpPaths, true);
    for (String s : indexPaths.split(",")) {
        out.write(new String(s + "\n").getBytes());
    }
    out.close();

    LOG.info("Job: ComputeNumberOfTermsAcrossIndexSegments");
    conf.setJobName("ComputeNumberOfTermsAcrossIndexSegments:" + collectionName);

    FileInputFormat.addInputPath(conf, tmpPaths);

    conf.setNumMapTasks(1);
    conf.setNumReduceTasks(1);
    conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.setInputFormat(NLineInputFormat.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(PairOfIntLong.class);
    conf.setOutputFormat(NullOutputFormat.class);
    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(IdentityReducer.class);

    long startTime = System.currentTimeMillis();
    RunningJob job = JobClient.runJob(conf);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    Counters counters = job.getCounters();
    long totalNumTerms = counters
            .findCounter("org.apache.hadoop.mapred.Task$Counter", 6, "REDUCE_INPUT_GROUPS").getCounter();

    LOG.info("total number of terms in global dictionary = " + totalNumTerms);

    // now build the dictionary
    fs.delete(new Path(dataOutputPath), true);

    conf = new JobConf(getConf(), MergeGlobalStatsAcrossIndexSegments.class);

    LOG.info("Job: MergeGlobalStatsAcrossIndexSegments");
    conf.setJobName("MergeGlobalStatsAcrossIndexSegments:" + collectionName);

    FileInputFormat.addInputPath(conf, tmpPaths);

    conf.setNumMapTasks(1);
    conf.setNumReduceTasks(1);
    conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.setInputFormat(NLineInputFormat.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(PairOfIntLong.class);
    conf.setOutputFormat(NullOutputFormat.class);
    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(MyReducer.class);
    // note: the original source cast totalNumTerms to int here, which would
    // silently truncate large term counts; pass the long value through unchanged
    conf.setLong("Ivory.IndexNumberOfTerms", totalNumTerms);

    startTime = System.currentTimeMillis();
    job = JobClient.runJob(conf);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    // compute number of docs, collection length, and average doc length
    long collectionLength = 0;
    int docCount = 0;
    for (String index : indexPaths.split(",")) {
        LOG.info("reading stats for " + index);

        RetrievalEnvironment env = new RetrievalEnvironment(index, fs);

        long l = env.readCollectionLength();
        int n = env.readCollectionDocumentCount();

        LOG.info(" - CollectionLength: " + l);
        LOG.info(" - CollectionDocumentCount: " + n);

        collectionLength += l;
        docCount += n;
    }

    float avgdl = (float) collectionLength / docCount;

    LOG.info("all index segments: ");
    LOG.info(" - CollectionLength: " + collectionLength);
    LOG.info(" - CollectionDocumentCount: " + docCount);
    LOG.info(" - AverageDocumentLength: " + avgdl);

    RetrievalEnvironment env = new RetrievalEnvironment(dataOutputPath, fs);
    env.writeCollectionAverageDocumentLength(avgdl);
    env.writeCollectionLength(collectionLength);
    env.writeCollectionDocumentCount(docCount);

    return 0;
}
From source file:ivory.core.preprocess.BuildTargetLangWeightedIntDocVectors.java
License:Apache License
@SuppressWarnings("deprecation") public int runTool() throws Exception { // sLogger.setLevel(Level.DEBUG); sLogger.info("PowerTool: GetTargetLangWeightedIntDocVectors"); JobConf conf = new JobConf(BuildTargetLangWeightedIntDocVectors.class); FileSystem fs = FileSystem.get(conf); String indexPath = getConf().get("Ivory.IndexPath"); RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs); String outputPath = env.getWeightedIntDocVectorsDirectory(); int mapTasks = getConf().getInt("Ivory.NumMapTasks", 0); int minSplitSize = getConf().getInt("Ivory.MinSplitSize", 0); String collectionName = getConf().get("Ivory.CollectionName"); sLogger.info("Characteristics of the collection:"); sLogger.info(" - CollectionName: " + collectionName); sLogger.info("Characteristics of the job:"); sLogger.info(" - NumMapTasks: " + mapTasks); sLogger.info(" - MinSplitSize: " + minSplitSize); String vocabFile = getConf().get("Ivory.FinalVocab"); DistributedCache.addCacheFile(new URI(vocabFile), conf); Path inputPath = new Path(PwsimEnvironment.getFileNameWithPars(indexPath, "TermDocs")); Path weightedVectorsPath = new Path(outputPath); if (fs.exists(weightedVectorsPath)) { sLogger.info("Output path already exists!"); return -1; }/*from w w w . j av a 2 s. co m*/ conf.setJobName("GetWeightedIntDocVectors:" + collectionName); conf.setNumMapTasks(mapTasks); conf.setNumReduceTasks(0); conf.setInt("mapred.min.split.size", minSplitSize); conf.set("mapred.child.java.opts", "-Xmx2048m"); conf.setBoolean("Ivory.Normalize", getConf().getBoolean("Ivory.Normalize", false)); FileInputFormat.setInputPaths(conf, inputPath); FileOutputFormat.setOutputPath(conf, weightedVectorsPath); conf.setInputFormat(SequenceFileInputFormat.class); conf.setMapOutputKeyClass(IntWritable.class); conf.setMapOutputValueClass(WeightedIntDocVector.class); conf.setOutputFormat(SequenceFileOutputFormat.class); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(WeightedIntDocVector.class); conf.setMapperClass(MyMapper.class); long startTime = System.currentTimeMillis(); RunningJob rj = JobClient.runJob(conf); sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); Counters counters = rj.getCounters(); long numOfDocs = (long) counters.findCounter(Docs.Total).getCounter(); return (int) numOfDocs; }
From source file:ivory.core.preprocess.BuildWeightedIntDocVectors.java
License:Apache License
@SuppressWarnings("deprecation") public int runTool() throws Exception { sLogger.setLevel(Level.WARN); sLogger.info("PowerTool: GetWeightedIntDocVectors"); // create a new JobConf, inheriting from the configuration of this // PowerTool/*from www .ja va 2s . c om*/ JobConf conf = new JobConf(getConf(), BuildWeightedIntDocVectors.class); FileSystem fs = FileSystem.get(conf); String indexPath = conf.get("Ivory.IndexPath"); RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs); String outputPath = env.getWeightedIntDocVectorsDirectory(); int mapTasks = conf.getInt("Ivory.NumMapTasks", 0); int minSplitSize = conf.getInt("Ivory.MinSplitSize", 0); String collectionName = conf.get("Ivory.CollectionName"); sLogger.info("Characteristics of the collection:"); sLogger.info(" - CollectionName: " + collectionName); sLogger.info("Characteristics of the job:"); sLogger.info(" - NumMapTasks: " + mapTasks); sLogger.info(" - MinSplitSize: " + minSplitSize); String dfByIntFilePath = env.getDfByIntData(); String cfByIntFilePath = env.getCfByIntData(); /* add df table to cache */ if (!fs.exists(new Path(dfByIntFilePath))) { throw new RuntimeException("Error, df data file " + dfByIntFilePath + "doesn't exist!"); } DistributedCache.addCacheFile(new URI(dfByIntFilePath), conf); /* add cf table to cache */ if (!fs.exists(new Path(cfByIntFilePath))) { throw new RuntimeException("Error, cf data file " + cfByIntFilePath + "doesn't exist!"); } DistributedCache.addCacheFile(new URI(cfByIntFilePath), conf); /* add dl table to cache */ Path docLengthFile = env.getDoclengthsData(); if (!fs.exists(docLengthFile)) { throw new RuntimeException("Error, doc-length data file " + docLengthFile + "doesn't exist!"); } DistributedCache.addCacheFile(docLengthFile.toUri(), conf); Path inputPath = new Path(env.getIntDocVectorsDirectory()); Path weightedVectorsPath = new Path(outputPath); if (fs.exists(weightedVectorsPath)) { sLogger.info("Output path already exists!"); return 0; } //fs.delete(weightedVectirsPath, true); conf.setJobName("GetWeightedIntDocVectors:" + collectionName); conf.setNumMapTasks(mapTasks); conf.setNumReduceTasks(0); conf.setInt("mapred.min.split.size", minSplitSize); conf.set("mapred.child.java.opts", "-Xmx2048m"); FileInputFormat.setInputPaths(conf, inputPath); FileOutputFormat.setOutputPath(conf, weightedVectorsPath); conf.setInputFormat(SequenceFileInputFormat.class); conf.setMapOutputKeyClass(IntWritable.class); conf.setMapOutputValueClass(WeightedIntDocVector.class); conf.setOutputFormat(SequenceFileOutputFormat.class); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(WeightedIntDocVector.class); conf.setMapperClass(MyMapper.class); //conf.setInt("mapred.task.timeout",3600000); long startTime = System.currentTimeMillis(); RunningJob job = JobClient.runJob(conf); sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); return 0; }