List of usage examples for org.apache.hadoop.mapred JobConf setReducerClass
public void setReducerClass(Class<? extends Reducer> theClass)
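Before the project-specific examples below, here is a minimal, self-contained sketch of the typical call site: an old-API (org.apache.hadoop.mapred) word-count driver that wires a reducer into a JobConf. The WordCountDriver, TokenMapper, and SumReducer classes and the argument paths are hypothetical placeholders, not taken from any example on this page.

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;

public class WordCountDriver {

    // Tokenizes each input line into (word, 1) pairs.
    public static class TokenMapper extends MapReduceBase
            implements Mapper<LongWritable, Text, Text, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);
        private final Text word = new Text();

        public void map(LongWritable key, Text value,
                OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
            for (String token : value.toString().split("\\s+")) {
                if (!token.isEmpty()) {
                    word.set(token);
                    output.collect(word, ONE);
                }
            }
        }
    }

    // Sums the counts for each word; this is the class handed to setReducerClass().
    public static class SumReducer extends MapReduceBase
            implements Reducer<Text, IntWritable, Text, IntWritable> {
        public void reduce(Text key, Iterator<IntWritable> values,
                OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
            int sum = 0;
            while (values.hasNext()) {
                sum += values.next().get();
            }
            output.collect(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(WordCountDriver.class);
        conf.setJobName("wordcount");

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);

        conf.setMapperClass(TokenMapper.class);
        conf.setReducerClass(SumReducer.class);   // the call this page documents
        conf.setCombinerClass(SumReducer.class);  // a summing reducer is safe to reuse as a combiner

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf); // submits the job and blocks until it completes
    }
}

Note that the old org.apache.hadoop.mapred API configures the reducer on JobConf, whereas the newer org.apache.hadoop.mapreduce API sets it on Job; all examples on this page use the old API.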
From source file: it.crs4.pydoop.pipes.Submitter.java
License: Apache License
@Override
public int run(String[] args) throws Exception {
    CommandLineParser cli = new CommandLineParser();
    if (args.length == 0) {
        cli.printUsage();
        return 1;
    }
    cli.addOption("input", false, "input path to the maps", "path");
    cli.addOption("output", false, "output path from the reduces", "path");
    cli.addOption("jar", false, "job jar file", "path");
    cli.addOption("inputformat", false, "java classname of InputFormat", "class");
    //cli.addArgument("javareader", false, "is the RecordReader in Java");
    cli.addOption("map", false, "java classname of Mapper", "class");
    cli.addOption("partitioner", false, "java classname of Partitioner", "class");
    cli.addOption("reduce", false, "java classname of Reducer", "class");
    cli.addOption("writer", false, "java classname of OutputFormat", "class");
    cli.addOption("program", false, "URI to application executable", "class");
    cli.addOption("reduces", false, "number of reduces", "num");
    cli.addOption("jobconf", false,
            "\"n1=v1,n2=v2,..\" (Deprecated) Optional. Add or override a JobConf property.", "key=val");
    cli.addOption("lazyOutput", false, "Optional. Create output lazily", "boolean");
    Parser parser = cli.createParser();
    try {
        GenericOptionsParser genericParser = new GenericOptionsParser(getConf(), args);
        CommandLine results = parser.parse(cli.options, genericParser.getRemainingArgs());

        JobConf job = new JobConf(getConf());

        if (results.hasOption("input")) {
            FileInputFormat.setInputPaths(job, results.getOptionValue("input"));
        }
        if (results.hasOption("output")) {
            FileOutputFormat.setOutputPath(job, new Path(results.getOptionValue("output")));
        }
        if (results.hasOption("jar")) {
            job.setJar(results.getOptionValue("jar"));
        }
        if (results.hasOption("inputformat")) {
            setIsJavaRecordReader(job, true);
            job.setInputFormat(getClass(results, "inputformat", job, InputFormat.class));
        }
        if (results.hasOption("javareader")) {
            setIsJavaRecordReader(job, true);
        }
        if (results.hasOption("map")) {
            setIsJavaMapper(job, true);
            job.setMapperClass(getClass(results, "map", job, Mapper.class));
        }
        if (results.hasOption("partitioner")) {
            job.setPartitionerClass(getClass(results, "partitioner", job, Partitioner.class));
        }
        if (results.hasOption("reduce")) {
            setIsJavaReducer(job, true);
            job.setReducerClass(getClass(results, "reduce", job, Reducer.class));
        }
        if (results.hasOption("reduces")) {
            job.setNumReduceTasks(Integer.parseInt(results.getOptionValue("reduces")));
        }
        if (results.hasOption("writer")) {
            setIsJavaRecordWriter(job, true);
            job.setOutputFormat(getClass(results, "writer", job, OutputFormat.class));
        }
        if (results.hasOption("lazyOutput")) {
            if (Boolean.parseBoolean(results.getOptionValue("lazyOutput"))) {
                LazyOutputFormat.setOutputFormatClass(job, job.getOutputFormat().getClass());
            }
        }
        if (results.hasOption("program")) {
            setExecutable(job, results.getOptionValue("program"));
        }
        if (results.hasOption("jobconf")) {
            LOG.warn("-jobconf option is deprecated, please use -D instead.");
            String options = results.getOptionValue("jobconf");
            StringTokenizer tokenizer = new StringTokenizer(options, ",");
            while (tokenizer.hasMoreTokens()) {
                String keyVal = tokenizer.nextToken().trim();
                String[] keyValSplit = keyVal.split("=");
                job.set(keyValSplit[0], keyValSplit[1]);
            }
        }

        // if they gave us a jar file, include it into the class path
        String jarFile = job.getJar();
        if (jarFile != null) {
            final URL[] urls = new URL[] {
                    FileSystem.getLocal(job).pathToFile(new Path(jarFile)).toURL() };
            // FindBugs complains that creating a URLClassLoader should be
            // in a doPrivileged() block.
            ClassLoader loader = AccessController.doPrivileged(new PrivilegedAction<ClassLoader>() {
                public ClassLoader run() {
                    return new URLClassLoader(urls);
                }
            });
            job.setClassLoader(loader);
        }

        runJob(job);
        return 0;
    } catch (ParseException pe) {
        LOG.info("Error : " + pe);
        cli.printUsage();
        return 1;
    }
}
From source file: it.isislab.sof.core.engine.hadoop.sshclient.utils.simulation.executor.SOF.java
License: Apache License
public static void main(String[] args) {
    /*
     * Example arguments (one-shot mode):
     * aids /home/michele/Scrivania/aids netlogo /home/michele/Scrivania/aids/aids.nlogo
     * /home/michele/Scrivania/aids/input.tmp /home/michele/Scrivania/aids/output
     * /home/michele/Scrivania/aids/output.xml false pepp ciao
     */
    if (args.length < 9 || args.length == 11 || args.length == 12 || args.length >= 15) {
        System.out.println("Usage:");
        System.out.println("java -jar SCUD.jar "
                + "<simulation_name> "
                + "<simulation_path_home> "
                + "<simulation_type[mason|netlogo|generic]> "
                + "<simulation_generic_interpreter_path> "
                + "<simulation_program_path> "
                + "<simulation_mapper_input_path> "
                + "<simulation_mapper_output_path> "
                + "<simulation_output_domain_xmlfile> "
                + "<simulation_input_path> "
                + "<<simulation_rating_path>> "
                + "<oneshot[one|loop]> "
                + "<author_name> "
                + "<simulation_description> "
                + "<path_interpreter_evaluate_file> "
                + "<evaluate_file_path>");
        System.exit(-1);
    }

    Configuration conf = null;
    JobConf job = null;
    String AUTHOR = null;                                     // author name
    String SIMULATION_NAME = null;                            // simulation name
    String SIMULATION_HOME = null;                            // simulation path
    String SIM_TYPE = null;                                   // mason, netlogo, generic
    String SIM_EXECUTABLE_SIMULATION_INTERPRETER_PATH = null;
    String SIM_EXECUTABLE_SIMULATION_PROGRAM = null;          // executable program *.jar | *.nlogo
    String SIM_EXECUTION_INPUT_DATA_MAPPER = null;            // input.data path
    String SIM_EXECUTION_OUTPUT_MAPPER = null;                // output loop(i) path
    String SIM_DESCRIPTION_OUTPUT_XML_DOMAIN = null;          // path of domain file
    String SIM_EXECUTION_INPUT_XML = null;                    // execution input path
    boolean ISLOOP = false;                                   // false[one] | true[loop]
    String INTERPRETER_REMOTE_PATH_EVALUATION = null;         // remote bin path for the evaluation program
    String EXECUTABLE_RATING_FILE = null;                     // path of rating file
    String SIM_RATING_PATH = null;

    /*
     * Example arguments (loop mode, 13 args):
     * aids /home/michele/Scrivania/aids netlogo /home/michele/Scrivania/aids/aids.nlogo
     * /home/michele/Scrivania/aids/input.tmp /home/michele/Scrivania/aids/output
     * /home/michele/Scrivania/aids/domain.xml /home/michele/Scrivania/aids/input loop
     * pepp ciao /usr/bin/python /home/michele/Scrivania/aids/evaluate.py
     */
    if (args.length == 13) {
        SIMULATION_NAME = args[0];
        SIMULATION_HOME = args[1];
        SIM_TYPE = args[2];
        SIM_EXECUTABLE_SIMULATION_PROGRAM = args[3];
        SIM_EXECUTION_INPUT_DATA_MAPPER = args[4];
        SIM_EXECUTION_OUTPUT_MAPPER = args[5];
        SIM_DESCRIPTION_OUTPUT_XML_DOMAIN = args[6];
        SIM_EXECUTION_INPUT_XML = args[7];
        SIM_RATING_PATH = args[8];
        ISLOOP = Boolean.parseBoolean(args[9]);
        AUTHOR = args[10];
        INTERPRETER_REMOTE_PATH_EVALUATION = args[11];
        EXECUTABLE_RATING_FILE = args[12];
    } else if (args.length == 9) {
        SIMULATION_NAME = args[0];
        SIMULATION_HOME = args[1];
        SIM_TYPE = args[2];
        SIM_EXECUTABLE_SIMULATION_PROGRAM = args[3];
        SIM_EXECUTION_INPUT_DATA_MAPPER = args[4];
        SIM_EXECUTION_OUTPUT_MAPPER = args[5];
        SIM_DESCRIPTION_OUTPUT_XML_DOMAIN = args[6];
        ISLOOP = Boolean.parseBoolean(args[7]);
        AUTHOR = args[8];
    } else if (args.length == 14) {
        SIMULATION_NAME = args[0];
        SIMULATION_HOME = args[1];
        SIM_TYPE = args[2];
        SIM_EXECUTABLE_SIMULATION_INTERPRETER_PATH = args[3];
        SIM_EXECUTABLE_SIMULATION_PROGRAM = args[4];
        SIM_EXECUTION_INPUT_DATA_MAPPER = args[5];
        SIM_EXECUTION_OUTPUT_MAPPER = args[6];
        SIM_DESCRIPTION_OUTPUT_XML_DOMAIN = args[7];
        SIM_EXECUTION_INPUT_XML = args[8];
        SIM_RATING_PATH = args[9];
        ISLOOP = Boolean.parseBoolean(args[10]);
        AUTHOR = args[11];
        INTERPRETER_REMOTE_PATH_EVALUATION = args[12];
        EXECUTABLE_RATING_FILE = args[13];
    } else if (args.length == 10) {
        SIMULATION_NAME = args[0];
        SIMULATION_HOME = args[1];
        SIM_TYPE = args[2];
        SIM_EXECUTABLE_SIMULATION_INTERPRETER_PATH = args[3];
        SIM_EXECUTABLE_SIMULATION_PROGRAM = args[4];
        SIM_EXECUTION_INPUT_DATA_MAPPER = args[5];
        SIM_EXECUTION_OUTPUT_MAPPER = args[6];
        SIM_DESCRIPTION_OUTPUT_XML_DOMAIN = args[7];
        ISLOOP = Boolean.parseBoolean(args[8]);
        AUTHOR = args[9];
    }

    if (!(SIM_TYPE.equalsIgnoreCase("mason") || SIM_TYPE.equalsIgnoreCase("netlogo")
            || SIM_TYPE.equalsIgnoreCase("generic"))) {
        System.exit(-2);
    }

    conf = new Configuration();
    job = new JobConf(conf, SOF.class);
    job.setJobName(SIMULATION_NAME);
    job.set("simulation.home", SIMULATION_HOME);
    job.set("simulation.name", SIMULATION_NAME);
    job.set("simulation.type", SIM_TYPE);

    if (SIM_TYPE.equalsIgnoreCase("generic")) {
        job.set("simulation.interpreter.genericsim", SIM_EXECUTABLE_SIMULATION_INTERPRETER_PATH);
    }

    job.set("simulation.program.simulation", SIM_EXECUTABLE_SIMULATION_PROGRAM);
    job.set("simulation.executable.input", SIM_EXECUTION_INPUT_DATA_MAPPER);
    job.set("simulation.executable.output", SIM_EXECUTION_OUTPUT_MAPPER);
    job.setBoolean("simulation.executable.mode", ISLOOP);
    job.set("simulation.executable.author", AUTHOR);
    job.set("simulation.description.output.domain", SIM_DESCRIPTION_OUTPUT_XML_DOMAIN);

    /*
     * Local testing only: generate the .tmp input from an XML file.
     * Keep these lines commented out for cluster runs.
     */
    //XmlToText.convertXmlFileToFileText(conf, "/home/lizard87/Desktop/mason_test/input.xml");
    //XmlToText.convertXmlFileToFileText(conf, "/home/lizard87/Desktop/input.xml");
    //XmlToText.convertXmlFileToFileText(conf, "/home/lizard87/Desktop/aids/input.xml");

    if (ISLOOP) {
        job.set("simulation.description.input", SIM_EXECUTION_INPUT_XML);
        job.set("simulation.program.rating", EXECUTABLE_RATING_FILE);
        job.set("simulation.interpreter.rating", INTERPRETER_REMOTE_PATH_EVALUATION);
        job.set("simulation.executable.loop.rating", SIM_RATING_PATH);
    }

    FileInputFormat.addInputPath(job, new Path(SIM_EXECUTION_INPUT_DATA_MAPPER)); // input directory
    FileOutputFormat.setOutputPath(job, new Path(SIM_EXECUTION_OUTPUT_MAPPER));

    if (SIM_TYPE.equalsIgnoreCase("mason")) {
        job.setMapperClass(SOFMapperMason.class);
        job.setReducerClass(SOFReducerMason.class);
    } else if (SIM_TYPE.equalsIgnoreCase("netlogo")) {
        job.setMapperClass(SOFMapperNetLogo.class);
        job.setReducerClass(SOFReducerNetLogo.class);
    } else if (SIM_TYPE.equalsIgnoreCase("generic")) {
        job.setMapperClass(SOFMapperGeneric.class);
        job.setReducerClass(SOFReducerGeneric.class);
    }

    job.setOutputKeyClass(org.apache.hadoop.io.Text.class);
    job.setOutputValueClass(org.apache.hadoop.io.Text.class);

    try {
        JobClient jobc = new JobClient(job);
        System.out.println(jobc + " " + job);
        // JobClient.runJob() blocks until the job completes, so no further
        // polling of the job status is needed.
        RunningJob runjob = JobClient.runJob(job);
        System.exit(0);
    } catch (IOException e) {
        e.printStackTrace();
    }
}
From source file: Iterator.SpeciesIterDriver2.java
@SuppressWarnings("deprecation") public static void main(String[] args) { int iterationCount = 0; while (iterationCount <= 20) { System.out.println("Running Iteration - " + iterationCount); JobClient client = new JobClient(); JobConf conf = new JobConf(SpeciesIterDriver2.class); conf.setJobName("Species Iter - " + iterationCount); // This property is set to generate 5 reducer tasks conf.setNumReduceTasks(5);/* w ww . j av a 2 s. c o m*/ conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(Text.class); //output-iterator-0 contains the input data FileInputFormat.setInputPaths(conf, new Path("output-iterator-" + iterationCount)); iterationCount++; FileOutputFormat.setOutputPath(conf, new Path("output-iterator-" + iterationCount)); conf.setMapperClass(SpeciesIterMapper2.class); conf.setReducerClass(SpeciesIterReducer2.class); conf.setCombinerClass(SpeciesIterReducer2.class); client.setConf(conf); try { JobClient.runJob(conf); } catch (Exception e) { e.printStackTrace(); } } }
From source file: ivory.core.index.MergeGlobalStatsAcrossIndexSegments.java
License: Apache License
public int runTool() throws Exception {
    JobConf conf = new JobConf(getConf(), MergeGlobalStatsAcrossIndexSegments.class);
    FileSystem fs = FileSystem.get(conf);

    String collectionName = conf.get("Ivory.CollectionName");
    String indexPaths = conf.get("Ivory.IndexPaths");
    String dataOutputPath = conf.get("Ivory.DataOutputPath");
    int dfThreshold = conf.getInt("Ivory.DfThreshold", 0);

    // first, compute size of global term space
    Path tmpPaths = new Path("/tmp/index-paths.txt");
    FSDataOutputStream out = fs.create(tmpPaths, true);
    for (String s : indexPaths.split(",")) {
        out.write((s + "\n").getBytes());
    }
    out.close();

    LOG.info("Job: ComputeNumberOfTermsAcrossIndexSegments");
    conf.setJobName("ComputeNumberOfTermsAcrossIndexSegments:" + collectionName);

    FileInputFormat.addInputPath(conf, tmpPaths);

    conf.setNumMapTasks(1);
    conf.setNumReduceTasks(1);
    conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.setInputFormat(NLineInputFormat.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(PairOfIntLong.class);
    conf.setOutputFormat(NullOutputFormat.class);
    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(IdentityReducer.class);

    long startTime = System.currentTimeMillis();
    RunningJob job = JobClient.runJob(conf);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    Counters counters = job.getCounters();
    long totalNumTerms = counters
            .findCounter("org.apache.hadoop.mapred.Task$Counter", 6, "REDUCE_INPUT_GROUPS").getCounter();
    LOG.info("total number of terms in global dictionary = " + totalNumTerms);

    // now build the dictionary
    fs.delete(new Path(dataOutputPath), true);
    conf = new JobConf(getConf(), MergeGlobalStatsAcrossIndexSegments.class);

    LOG.info("Job: MergeGlobalStatsAcrossIndexSegments");
    conf.setJobName("MergeGlobalStatsAcrossIndexSegments:" + collectionName);

    FileInputFormat.addInputPath(conf, tmpPaths);

    conf.setNumMapTasks(1);
    conf.setNumReduceTasks(1);
    conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.setInputFormat(NLineInputFormat.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(PairOfIntLong.class);
    conf.setOutputFormat(NullOutputFormat.class);
    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(MyReducer.class);
    conf.setLong("Ivory.IndexNumberOfTerms", totalNumTerms);

    startTime = System.currentTimeMillis();
    job = JobClient.runJob(conf);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    // compute number of docs, collection length, avg doc length
    long collectionLength = 0;
    int docCount = 0;
    for (String index : indexPaths.split(",")) {
        LOG.info("reading stats for " + index);
        RetrievalEnvironment env = new RetrievalEnvironment(index, fs);
        long l = env.readCollectionLength();
        int n = env.readCollectionDocumentCount();
        LOG.info(" - CollectionLength: " + l);
        LOG.info(" - CollectionDocumentCount: " + n);
        collectionLength += l;
        docCount += n;
    }

    float avgdl = (float) collectionLength / docCount;

    LOG.info("all index segments: ");
    LOG.info(" - CollectionLength: " + collectionLength);
    LOG.info(" - CollectionDocumentCount: " + docCount);
    LOG.info(" - AverageDocumentLength: " + avgdl);

    RetrievalEnvironment env = new RetrievalEnvironment(dataOutputPath, fs);
    env.writeCollectionAverageDocumentLength(avgdl);
    env.writeCollectionLength(collectionLength);
    env.writeCollectionDocumentCount(docCount);

    return 0;
}
From source file: ivory.index.BuildIntPostingsForwardIndex.java
License: Apache License
public int runTool() throws Exception {
    JobConf conf = new JobConf(getConf(), BuildIntPostingsForwardIndex.class);
    FileSystem fs = FileSystem.get(conf);

    int mapTasks = conf.getInt("Ivory.NumMapTasks", 0);
    int minSplitSize = conf.getInt("Ivory.MinSplitSize", 0);
    String indexPath = conf.get("Ivory.IndexPath");

    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    String collectionName = env.readCollectionName();

    sLogger.info("Tool: BuildIntPostingsForwardIndex");
    sLogger.info(" - IndexPath: " + indexPath);
    sLogger.info(" - CollectionName: " + collectionName);

    conf.setJobName("BuildIntPostingsForwardIndex:" + collectionName);

    Path inputPath = new Path(env.getPostingsDirectory());
    FileInputFormat.setInputPaths(conf, inputPath);

    Path postingsIndexPath = new Path(env.getPostingsIndexData());
    if (fs.exists(postingsIndexPath)) {
        sLogger.info("Postings forward index path already exists!");
        return 0;
    }

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(1);
    conf.setInt("mapred.min.split.size", minSplitSize);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setOutputFormat(NullOutputFormat.class);

    conf.setMapRunnerClass(MyMapRunner.class);
    conf.setReducerClass(MyReducer.class);

    JobClient.runJob(conf);

    return 0;
}
From source file: ivory.index.BuildIPInvertedIndexDocSorted.java
License: Apache License
@SuppressWarnings("unused") public int runTool() throws Exception { JobConf conf = new JobConf(getConf(), BuildIPInvertedIndexDocSorted.class); FileSystem fs = FileSystem.get(conf); String indexPath = conf.get("Ivory.IndexPath"); RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs); String collectionName = env.readCollectionName(); int mapTasks = conf.getInt("Ivory.NumMapTasks", 0); int reduceTasks = conf.getInt("Ivory.NumReduceTasks", 0); int minSplitSize = conf.getInt("Ivory.MinSplitSize", 0); int collectionDocCnt = env.readCollectionDocumentCount(); LOG.info("PowerTool: BuildIPInvertedIndexDocSorted"); LOG.info(" - IndexPath: " + indexPath); LOG.info(" - CollectionName: " + collectionName); LOG.info(" - CollectionDocumentCount: " + collectionDocCnt); LOG.info(" - NumMapTasks: " + mapTasks); LOG.info(" - NumReduceTasks: " + reduceTasks); LOG.info(" - MinSplitSize: " + minSplitSize); if (!fs.exists(new Path(indexPath))) { fs.mkdirs(new Path(indexPath)); }//from ww w . ja v a 2 s . co m Path inputPath = new Path(env.getIntDocVectorsDirectory()); Path postingsPath = new Path(env.getPostingsDirectory()); if (fs.exists(postingsPath)) { LOG.info("Postings already exist: no indexing will be performed."); return 0; } conf.setJobName("BuildIPInvertedIndex:" + collectionName); conf.setNumMapTasks(mapTasks); conf.setNumReduceTasks(reduceTasks); conf.setInt("Ivory.CollectionDocumentCount", collectionDocCnt); conf.setInt("mapred.min.split.size", minSplitSize); conf.set("mapred.child.java.opts", "-Xmx2048m"); FileInputFormat.setInputPaths(conf, inputPath); FileOutputFormat.setOutputPath(conf, postingsPath); conf.setInputFormat(SequenceFileInputFormat.class); conf.setOutputFormat(SequenceFileOutputFormat.class); conf.setMapOutputKeyClass(PairOfInts.class); conf.setMapOutputValueClass(TermPositions.class); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(PostingsListDocSortedPositional.class); conf.setMapperClass(MyMapper.class); conf.setReducerClass(MyReducer.class); conf.setPartitionerClass(MyPartitioner.class); long startTime = System.currentTimeMillis(); RunningJob job = JobClient.runJob(conf); LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); env.writePostingsType("ivory.data.PostingsListDocSortedPositional"); return 0; }
From source file: ivory.preprocess.BuildIntDocVectorsForwardIndex.java
License: Apache License
public int runTool() throws Exception {
    JobConf conf = new JobConf(getConf(), BuildIntDocVectorsForwardIndex.class);
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get("Ivory.IndexPath");
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    int mapTasks = conf.getInt("Ivory.NumMapTasks", 0);
    String collectionName = env.readCollectionName();
    boolean buildWeighted = conf.getBoolean("Ivory.BuildWeighted", false);

    sLogger.info("Tool: BuildIntDocVectorsIndex");
    sLogger.info(" - IndexPath: " + indexPath);
    sLogger.info(" - CollectionName: " + collectionName);
    sLogger.info(" - BuildWeighted: " + buildWeighted);
    sLogger.info(" - NumMapTasks: " + mapTasks);

    String intDocVectorsPath;
    String forwardIndexPath;
    if (buildWeighted) {
        intDocVectorsPath = env.getWeightedIntDocVectorsDirectory();
        forwardIndexPath = env.getWeightedIntDocVectorsForwardIndex();
    } else {
        intDocVectorsPath = env.getIntDocVectorsDirectory();
        forwardIndexPath = env.getIntDocVectorsForwardIndex();
    }

    if (!fs.exists(new Path(intDocVectorsPath))) {
        sLogger.info("Error: IntDocVectors don't exist!");
        return 0;
    }

    if (fs.exists(new Path(forwardIndexPath))) {
        sLogger.info("IntDocVectorIndex already exists: skipping!");
        return 0;
    }

    conf.setJobName("BuildIntDocVectorsForwardIndex:" + collectionName);

    Path inputPath = new Path(intDocVectorsPath);
    FileInputFormat.setInputPaths(conf, inputPath);

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(1);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setOutputFormat(NullOutputFormat.class);

    conf.setMapRunnerClass(MyMapRunner.class);
    conf.setReducerClass(MyReducer.class);

    JobClient.runJob(conf);

    return 0;
}
From source file: ivory.preprocess.BuildTermDocVectorsForwardIndex.java
License: Apache License
public int runTool() throws Exception {
    JobConf conf = new JobConf(getConf(), BuildTermDocVectorsForwardIndex.class);
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get("Ivory.IndexPath");
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    int mapTasks = conf.getInt("Ivory.NumMapTasks", 0);
    String collectionName = env.readCollectionName();

    sLogger.info("Tool: BuildTermDocVectorsIndex");
    sLogger.info(" - IndexPath: " + indexPath);
    sLogger.info(" - CollectionName: " + collectionName);
    sLogger.info(" - NumMapTasks: " + mapTasks);

    if (!fs.exists(new Path(env.getTermDocVectorsDirectory()))) {
        sLogger.info("Error: TermDocVectors don't exist!");
        return 0;
    }

    if (fs.exists(new Path(env.getTermDocVectorsForwardIndex()))) {
        sLogger.info("TermDocVectorIndex already exists: skipping!");
        return 0;
    }

    conf.setJobName("BuildTermDocVectorsForwardIndex:" + collectionName);

    Path inputPath = new Path(env.getTermDocVectorsDirectory());
    FileInputFormat.setInputPaths(conf, inputPath);

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(1);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setOutputFormat(NullOutputFormat.class);

    conf.setMapRunnerClass(MyMapRunner.class);
    conf.setReducerClass(MyReducer.class);

    JobClient.runJob(conf);

    return 0;
}
From source file: ivory.preprocess.BuildTermIdMap.java
License: Apache License
@SuppressWarnings("unused") public int runTool() throws Exception { // create a new JobConf, inheriting from the configuration of this // PowerTool/*ww w . j av a 2s. c o m*/ JobConf conf = new JobConf(getConf(), BuildTermIdMap.class); FileSystem fs = FileSystem.get(conf); String indexPath = conf.get("Ivory.IndexPath"); String collectionName = conf.get("Ivory.CollectionName"); int mapTasks = conf.getInt("Ivory.NumMapTasks", 0); int reduceTasks = 1; int minSplitSize = conf.getInt("Ivory.MinSplitSize", 0); sLogger.info("PowerTool: BuildTermIdMap"); sLogger.info(" - CollectionName: " + collectionName); sLogger.info(" - IndexPath: " + indexPath); sLogger.info(" - NumMapTasks: " + mapTasks); sLogger.info(" - NumReduceTasks: " + reduceTasks); RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs); if (!fs.exists(new Path(indexPath))) { sLogger.error("index path doesn't existing: skipping!"); return 0; } Path termsFilePath = new Path(env.getIndexTermsData()); Path termIDsFilePath = new Path(env.getIndexTermIdsData()); Path idToTermFilePath = new Path(env.getIndexTermIdMappingData()); Path dfByTermFilePath = new Path(env.getDfByTermData()); Path cfByTermFilePath = new Path(env.getCfByTermData()); Path dfByIntFilePath = new Path(env.getDfByIntData()); Path cfByIntFilePath = new Path(env.getCfByIntData()); if (fs.exists(termsFilePath) || fs.exists(termIDsFilePath) || fs.exists(idToTermFilePath) || fs.exists(dfByTermFilePath) || fs.exists(cfByTermFilePath) || fs.exists(dfByIntFilePath) || fs.exists(cfByIntFilePath)) { sLogger.info("term and term id data exist: skipping!"); return 0; } Path tmpPath = new Path(env.getTempDirectory()); fs.delete(tmpPath, true); conf.setJobName("BuildTermIdMap:" + collectionName); conf.setNumMapTasks(mapTasks); conf.setNumReduceTasks(reduceTasks); conf.setInt("Ivory.CollectionTermCount", (int) env.readCollectionTermCount()); conf.setInt("mapred.min.split.size", minSplitSize); conf.set("mapred.child.java.opts", "-Xmx2048m"); FileInputFormat.setInputPaths(conf, new Path(env.getTermDfCfDirectory())); FileOutputFormat.setOutputPath(conf, tmpPath); conf.setInputFormat(SequenceFileInputFormat.class); conf.setOutputFormat(SequenceFileOutputFormat.class); conf.setMapOutputKeyClass(Text.class); conf.setMapOutputValueClass(PairOfIntLong.class); conf.setOutputKeyClass(Text.class); conf.setMapperClass(IdentityMapper.class); conf.setReducerClass(MyReducer.class); long startTime = System.currentTimeMillis(); RunningJob job = JobClient.runJob(conf); sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); fs.delete(tmpPath, true); return 0; }
From source file: ivory.preprocess.GetTermCount.java
License: Apache License
public int runTool() throws Exception {
    // create a new JobConf, inheriting from the configuration of this PowerTool
    JobConf conf = new JobConf(getConf(), GetTermCount.class);
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get(Constants.IndexPath);
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    int mapTasks = conf.getInt(Constants.NumMapTasks, 0);
    int reduceTasks = conf.getInt(Constants.NumReduceTasks, 0);

    String collectionName = env.readCollectionName();
    String termDocVectorsPath = env.getTermDocVectorsDirectory();
    String termDfCfPath = env.getTermDfCfDirectory();

    if (!fs.exists(new Path(indexPath))) {
        sLogger.info("index path doesn't exist: skipping!");
        return 0;
    }

    sLogger.info("PowerTool: GetTermCount");
    sLogger.info(" - CollectionName: " + collectionName);
    sLogger.info(" - NumMapTasks: " + mapTasks);
    sLogger.info(" - NumReduceTasks: " + reduceTasks);
    sLogger.info(" - MinDf: " + conf.getInt(Constants.MinDf, 0));
    sLogger.info(" - MaxDf: " + conf.getInt(Constants.MaxDf, Integer.MAX_VALUE));

    Path outputPath = new Path(termDfCfPath);
    if (fs.exists(outputPath)) {
        sLogger.error("TermDfCf directory exists: skipping!");
        return 0;
    }

    conf.setJobName("GetTermCount:" + collectionName);

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(reduceTasks);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    FileInputFormat.setInputPaths(conf, new Path(termDocVectorsPath));
    FileOutputFormat.setOutputPath(conf, outputPath);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(PairOfIntLong.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(PairOfIntLong.class);

    conf.setMapperClass(MyMapper.class);
    conf.setCombinerClass(MyCombiner.class);
    conf.setReducerClass(MyReducer.class);

    long startTime = System.currentTimeMillis();
    RunningJob job = JobClient.runJob(conf);
    sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    Counters counters = job.getCounters();

    // write out the number of postings
    int collectionTermCount = (int) counters.findCounter(Statistics.Terms).getCounter();
    env.writeCollectionTermCount(collectionTermCount);

    // NOTE: this value is not the same as the number of postings, because
    // postings for non-English terms are discarded, or dropped as a result of df cuts
    long collectionLength = counters.findCounter(Statistics.SumOfDocLengths).getCounter();
    env.writeCollectionLength(collectionLength);

    return 0;
}