List of usage examples for org.apache.hadoop.mapred JobConf setPartitionerClass
public void setPartitionerClass(Class<? extends Partitioner> theClass)
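Before the project examples below, here is a minimal, self-contained sketch of how this method is typically used with the old mapred API: a custom Partitioner implementation plus the JobConf wiring. The class name PrefixPartitioner, the key/value types, and the command-line paths are illustrative assumptions, not taken from any of the projects listed; the mapper and reducer classes a real job would need are left as commented placeholders.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Partitioner;

/** Hypothetical partitioner: routes each key to a reducer by the text before the first ':'. */
public class PrefixPartitioner implements Partitioner<Text, IntWritable> {

    public void configure(JobConf job) {
        // no per-job configuration needed in this sketch
    }

    public int getPartition(Text key, IntWritable value, int numPartitions) {
        String prefix = key.toString().split(":", 2)[0];
        // mask the sign bit so the result is always a valid partition index
        return (prefix.hashCode() & Integer.MAX_VALUE) % numPartitions;
    }

    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(PrefixPartitioner.class);
        conf.setJobName("setPartitionerClass-demo");
        // conf.setMapperClass(...); conf.setReducerClass(...); // job-specific classes omitted
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);
        conf.setPartitionerClass(PrefixPartitioner.class); // the call documented on this page
        conf.setNumReduceTasks(4); // getPartition() receives this value as numPartitions
        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));
        JobClient.runJob(conf);
    }
}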
From source file:net.peacesoft.nutch.crawl.ReGenerator.java
License:Apache License
/**
 * Generate fetchlists in one or more segments. Whether to filter URLs or
 * not is read from the crawl.generate.filter property in the configuration
 * files. If the property is not found, the URLs are filtered. Same for the
 * normalisation.
 *
 * @param dbDir Crawl database directory
 * @param segments Segments directory
 * @param numLists Number of reduce tasks
 * @param topN Number of top URLs to be selected
 * @param curTime Current time in milliseconds
 *
 * @return Path to generated segment or null if no entries were selected
 *
 * @throws IOException When an I/O error occurs
 */
public Path[] generate(Path dbDir, Path segments, int numLists, long topN, long curTime, boolean filter,
        boolean norm, boolean force, int maxNumSegments) throws IOException {
    try {
        Path tempDir = new Path(
                getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

        Path lock = new Path(dbDir, CrawlDb.LOCK_NAME);
        FileSystem fs = FileSystem.get(getConf());
        LockUtil.createLockFile(fs, lock, force);

        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        long start = System.currentTimeMillis();
        LOG.info("ReGenerator: starting at " + sdf.format(start));
        LOG.info("ReGenerator: Selecting best-scoring urls due for fetch.");
        LOG.info("ReGenerator: filtering: " + filter);
        LOG.info("ReGenerator: normalizing: " + norm);
        if (topN != Long.MAX_VALUE) {
            LOG.info("ReGenerator: topN: " + topN);
        }
        if ("true".equals(getConf().get(GENERATE_MAX_PER_HOST_BY_IP))) {
            LOG.info("ReGenerator: GENERATE_MAX_PER_HOST_BY_IP will be ignored, use partition.url.mode instead");
        }

        // map to inverted subset due for fetch, sort by score
        JobConf job = new NutchJob(getConf());
        job.setJobName("generate: select from " + dbDir);

        if (numLists == -1) { // for politeness make
            numLists = job.getNumMapTasks(); // a partition per fetch task
        }
        if ("local".equals(job.get("mapred.job.tracker")) && numLists != 1) { // override
            LOG.info("ReGenerator: jobtracker is 'local', generating exactly one partition.");
            numLists = 1;
        }
        job.setLong(GENERATOR_CUR_TIME, curTime);
        // record real generation time
        long generateTime = System.currentTimeMillis();
        job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
        job.setLong(GENERATOR_TOP_N, topN);
        job.setBoolean(GENERATOR_FILTER, filter);
        job.setBoolean(GENERATOR_NORMALISE, norm);
        job.setInt(GENERATOR_MAX_NUM_SEGMENTS, maxNumSegments);

        FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
        job.setInputFormat(SequenceFileInputFormat.class);

        job.setMapperClass(Selector.class);
        job.setPartitionerClass(Selector.class);
        job.setReducerClass(Selector.class);

        FileOutputFormat.setOutputPath(job, tempDir);
        job.setOutputFormat(SequenceFileOutputFormat.class);
        job.setOutputKeyClass(FloatWritable.class);
        job.setOutputKeyComparatorClass(DecreasingFloatComparator.class);
        job.setOutputValueClass(SelectorEntry.class);
        job.setOutputFormat(GeneratorOutputFormat.class);

        try {
            JobClient.runJob(job);
        } catch (IOException e) {
            throw e;
        }

        // read the subdirectories generated in the temp
        // output and turn them into segments
        List<Path> generatedSegments = new ArrayList<Path>();

        FileStatus[] status = fs.listStatus(tempDir);
        try {
            for (FileStatus stat : status) {
                Path subfetchlist = stat.getPath();
                if (!subfetchlist.getName().startsWith("fetchlist-")) {
                    continue;
                }
                // start a new partition job for this segment
                Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists);
                generatedSegments.add(newSeg);
            }
        } catch (Exception e) {
            LOG.warn("ReGenerator: exception while partitioning segments, exiting ...");
            fs.delete(tempDir, true);
            return null;
        }

        if (generatedSegments.size() == 0) {
            LOG.warn("ReGenerator: 0 records selected for fetching, exiting ...");
            LockUtil.removeLockFile(fs, lock);
            fs.delete(tempDir, true);
            return null;
        }

        if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
            // update the db from tempDir
            Path tempDir2 = new Path(
                    getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

            job = new NutchJob(getConf());
            job.setJobName("generate: updatedb " + dbDir);
            job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
            for (Path segmpaths : generatedSegments) {
                Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME);
                FileInputFormat.addInputPath(job, subGenDir);
            }
            FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
            job.setInputFormat(SequenceFileInputFormat.class);
            job.setMapperClass(CrawlDbUpdater.class);
            job.setReducerClass(CrawlDbUpdater.class);
            job.setOutputFormat(MapFileOutputFormat.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(CrawlDatum.class);
            FileOutputFormat.setOutputPath(job, tempDir2);
            try {
                JobClient.runJob(job);
                CrawlDb.install(job, dbDir);
            } catch (IOException e) {
                LockUtil.removeLockFile(fs, lock);
                fs.delete(tempDir, true);
                fs.delete(tempDir2, true);
                throw e;
            }
            fs.delete(tempDir2, true);
        }

        LockUtil.removeLockFile(fs, lock);
        fs.delete(tempDir, true);

        long end = System.currentTimeMillis();
        LOG.info("ReGenerator: finished at " + sdf.format(end) + ", elapsed: "
                + TimingUtil.elapsedTime(start, end));

        Path[] patharray = new Path[generatedSegments.size()];
        return generatedSegments.toArray(patharray);
    } catch (Exception ex) {
        LOG.error("ReGenerator generate error: " + ex.toString(), ex);
        return null;
    }
}
From source file:org.acacia.csr.java.CSRConverter.java
License:Apache License
public static void main(String[] args) throws Exception {
    if (!validArgs(args)) {
        printUsage();
        return;
    }

    // These are the temp paths that are created on HDFS
    String dir1 = "/user/miyuru/csrconverter-output";
    String dir2 = "/user/miyuru/csrconverter-output-sorted";

    // We first delete the temporary directories if they exist on the HDFS
    FileSystem fs1 = FileSystem.get(new JobConf());

    System.out.println("Deleting the dir : " + dir1);
    if (fs1.exists(new Path(dir1))) {
        fs1.delete(new Path(dir1), true);
    }
    System.out.println("Done deleting the dir : " + dir1);

    System.out.println("Deleting the dir : " + dir2);
    if (fs1.exists(new Path(dir2))) {
        fs1.delete(new Path(dir2), true);
    }

    Path notinPath = new Path("/user/miyuru/notinverts/notinverts");
    if (!fs1.exists(notinPath)) {
        fs1.create(notinPath);
    }
    System.out.println("Done deleting the dir : " + dir2);

    // Note on Aug 23 2014: Sometimes after this the mapReduce job hangs. need to see why.
    VertexCounterClient.setDefaultGraphID(args[3], args[2]);

    // First job creates the inverted index
    JobConf conf = new JobConf(CSRConverter.class);
    conf.set("org.acacia.partitioner.hbase.zookeeper.quorum", args[1]);
    conf.set("org.acacia.partitioner.hbase.table", args[2]);
    conf.set("org.acacia.partitioner.hbase.contacthost", args[3]);
    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(Text.class);

    // conf.setMapperClass(InvertedMapper.class);
    conf.setReducerClass(InvertedReducer.class);
    // conf.setInputFormat(TextInputFormat.class);
    conf.setInputFormat(NLinesInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    // FileInputFormat.setInputPaths(conf, new Path(args[0]));
    MultipleInputs.addInputPath(conf, new Path(args[0]), NLinesInputFormat.class, InvertedMapper.class);
    MultipleInputs.addInputPath(conf, new Path("/user/miyuru/notinverts/notinverts"), TextInputFormat.class,
            InvertedMapper.class);
    FileOutputFormat.setOutputPath(conf, new Path(dir1));

    // Also for the moment we turn-off the speculative execution
    conf.setBoolean("mapred.map.tasks.speculative.execution", false);
    conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
    conf.setNumMapTasks(96);
    conf.setNumReduceTasks(96);
    conf.setPartitionerClass(VertexPartitioner.class);
    conf.set("vertex-count", args[4]);
    conf.set("zero-flag", args[5]);

    Job job = new Job(conf, "csr_inverter");
    job.setSortComparatorClass(SortComparator.class);
    job.waitForCompletion(true);
}
From source file:org.apache.avro.mapred.tether.TetherJob.java
License:Apache License
private static void setupTetherJob(JobConf job) throws IOException {
    job.setMapRunnerClass(TetherMapRunner.class);
    job.setPartitionerClass(TetherPartitioner.class);
    job.setReducerClass(TetherReducer.class);

    job.setInputFormat(TetherInputFormat.class);
    job.setOutputFormat(TetherOutputFormat.class);

    job.setOutputKeyClass(TetherData.class);
    job.setOutputKeyComparatorClass(TetherKeyComparator.class);
    job.setMapOutputValueClass(NullWritable.class);

    // set the map output key class to TetherData
    job.setMapOutputKeyClass(TetherData.class);

    // add TetherKeySerialization to io.serializations
    Collection<String> serializations = job.getStringCollection("io.serializations");
    if (!serializations.contains(TetherKeySerialization.class.getName())) {
        serializations.add(TetherKeySerialization.class.getName());
        job.setStrings("io.serializations", serializations.toArray(new String[0]));
    }

    // determine whether the executable should be added to the cache.
    if (job.getBoolean(TETHER_EXEC_CACHED, false)) {
        DistributedCache.addCacheFile(getExecutable(job), job);
    }
}
From source file:org.apache.ignite.internal.processors.hadoop.GridHadoopMapReduceEmbeddedSelfTest.java
License:Apache License
/**
 * Tests whole job execution with all phases in old and new versions of API with definition of custom
 * Serialization, Partitioner and IO formats.
 * @throws Exception If fails.
 */
public void testMultiReducerWholeMapReduceExecution() throws Exception {
    IgfsPath inDir = new IgfsPath(PATH_INPUT);
    igfs.mkdirs(inDir);

    IgfsPath inFile = new IgfsPath(inDir, GridHadoopWordCount2.class.getSimpleName() + "-input");

    generateTestFile(inFile.toString(), "key1", 10000, "key2", 20000, "key3", 15000, "key4", 7000, "key5",
            12000, "key6", 18000);

    for (int i = 0; i < 2; i++) {
        boolean useNewAPI = i == 1;

        igfs.delete(new IgfsPath(PATH_OUTPUT), true);

        flags.put("serializationWasConfigured", false);
        flags.put("partitionerWasConfigured", false);
        flags.put("inputFormatWasConfigured", false);
        flags.put("outputFormatWasConfigured", false);

        JobConf jobConf = new JobConf();

        jobConf.set(CommonConfigurationKeys.IO_SERIALIZATIONS_KEY, CustomSerialization.class.getName());

        // To split into about 6-7 items for v2
        jobConf.setInt(FileInputFormat.SPLIT_MAXSIZE, 65000);

        // For v1
        jobConf.setInt("fs.local.block.size", 65000);

        // File system coordinates.
        setupFileSystems(jobConf);

        GridHadoopWordCount1.setTasksClasses(jobConf, !useNewAPI, !useNewAPI, !useNewAPI);

        if (!useNewAPI) {
            jobConf.setPartitionerClass(CustomV1Partitioner.class);
            jobConf.setInputFormat(CustomV1InputFormat.class);
            jobConf.setOutputFormat(CustomV1OutputFormat.class);
        }

        Job job = Job.getInstance(jobConf);

        GridHadoopWordCount2.setTasksClasses(job, useNewAPI, useNewAPI, useNewAPI);

        if (useNewAPI) {
            job.setPartitionerClass(CustomV2Partitioner.class);
            job.setInputFormatClass(CustomV2InputFormat.class);
            job.setOutputFormatClass(CustomV2OutputFormat.class);
        }

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path(igfsScheme() + inFile.toString()));
        FileOutputFormat.setOutputPath(job, new Path(igfsScheme() + PATH_OUTPUT));

        job.setNumReduceTasks(3);

        job.setJarByClass(GridHadoopWordCount2.class);

        IgniteInternalFuture<?> fut = grid(0).hadoop().submit(new GridHadoopJobId(UUID.randomUUID(), 1),
                createJobInfo(job.getConfiguration()));

        fut.get();

        assertTrue("Serialization was configured (new API is " + useNewAPI + ")",
                flags.get("serializationWasConfigured"));
        assertTrue("Partitioner was configured (new API is = " + useNewAPI + ")",
                flags.get("partitionerWasConfigured"));
        assertTrue("Input format was configured (new API is = " + useNewAPI + ")",
                flags.get("inputFormatWasConfigured"));
        assertTrue("Output format was configured (new API is = " + useNewAPI + ")",
                flags.get("outputFormatWasConfigured"));

        assertEquals("Use new API = " + useNewAPI, "key3\t15000\n" + "key6\t18000\n",
                readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00000"));
        assertEquals("Use new API = " + useNewAPI, "key1\t10000\n" + "key4\t7000\n",
                readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00001"));
        assertEquals("Use new API = " + useNewAPI, "key2\t20000\n" + "key5\t12000\n",
                readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00002"));
    }
}
From source file:org.apache.ignite.internal.processors.hadoop.HadoopMapReduceEmbeddedSelfTest.java
License:Apache License
/**
 * Tests whole job execution with all phases in old and new versions of API with definition of custom
 * Serialization, Partitioner and IO formats.
 * @throws Exception If fails.
 */
public void testMultiReducerWholeMapReduceExecution() throws Exception {
    IgfsPath inDir = new IgfsPath(PATH_INPUT);
    igfs.mkdirs(inDir);

    IgfsPath inFile = new IgfsPath(inDir, HadoopWordCount2.class.getSimpleName() + "-input");

    generateTestFile(inFile.toString(), "key1", 10000, "key2", 20000, "key3", 15000, "key4", 7000, "key5",
            12000, "key6", 18000);

    for (int i = 0; i < 2; i++) {
        boolean useNewAPI = i == 1;

        igfs.delete(new IgfsPath(PATH_OUTPUT), true);

        flags.put("serializationWasConfigured", false);
        flags.put("partitionerWasConfigured", false);
        flags.put("inputFormatWasConfigured", false);
        flags.put("outputFormatWasConfigured", false);

        JobConf jobConf = new JobConf();

        jobConf.set(CommonConfigurationKeys.IO_SERIALIZATIONS_KEY, CustomSerialization.class.getName());

        // To split into about 6-7 items for v2
        jobConf.setInt(FileInputFormat.SPLIT_MAXSIZE, 65000);

        // For v1
        jobConf.setInt("fs.local.block.size", 65000);

        // File system coordinates.
        setupFileSystems(jobConf);

        HadoopWordCount1.setTasksClasses(jobConf, !useNewAPI, !useNewAPI, !useNewAPI);

        if (!useNewAPI) {
            jobConf.setPartitionerClass(CustomV1Partitioner.class);
            jobConf.setInputFormat(CustomV1InputFormat.class);
            jobConf.setOutputFormat(CustomV1OutputFormat.class);
        }

        Job job = Job.getInstance(jobConf);

        HadoopWordCount2.setTasksClasses(job, useNewAPI, useNewAPI, useNewAPI);

        if (useNewAPI) {
            job.setPartitionerClass(CustomV2Partitioner.class);
            job.setInputFormatClass(CustomV2InputFormat.class);
            job.setOutputFormatClass(CustomV2OutputFormat.class);
        }

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path(igfsScheme() + inFile.toString()));
        FileOutputFormat.setOutputPath(job, new Path(igfsScheme() + PATH_OUTPUT));

        job.setNumReduceTasks(3);

        job.setJarByClass(HadoopWordCount2.class);

        IgniteInternalFuture<?> fut = grid(0).hadoop().submit(new HadoopJobId(UUID.randomUUID(), 1),
                createJobInfo(job.getConfiguration()));

        fut.get();

        assertTrue("Serialization was configured (new API is " + useNewAPI + ")",
                flags.get("serializationWasConfigured"));
        assertTrue("Partitioner was configured (new API is = " + useNewAPI + ")",
                flags.get("partitionerWasConfigured"));
        assertTrue("Input format was configured (new API is = " + useNewAPI + ")",
                flags.get("inputFormatWasConfigured"));
        assertTrue("Output format was configured (new API is = " + useNewAPI + ")",
                flags.get("outputFormatWasConfigured"));

        assertEquals("Use new API = " + useNewAPI, "key3\t15000\n" + "key6\t18000\n",
                readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00000"));
        assertEquals("Use new API = " + useNewAPI, "key1\t10000\n" + "key4\t7000\n",
                readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00001"));
        assertEquals("Use new API = " + useNewAPI, "key2\t20000\n" + "key5\t12000\n",
                readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00002"));
    }
}
From source file:org.apache.ignite.internal.processors.hadoop.impl.HadoopMapReduceEmbeddedSelfTest.java
License:Apache License
/**
 * Tests whole job execution with all phases in old and new versions of API with definition of custom
 * Serialization, Partitioner and IO formats.
 *
 * @param striped Whether output should be striped or not.
 * @throws Exception If fails.
 */
public void checkMultiReducerWholeMapReduceExecution(boolean striped) throws Exception {
    IgfsPath inDir = new IgfsPath(PATH_INPUT);
    igfs.mkdirs(inDir);

    IgfsPath inFile = new IgfsPath(inDir, HadoopWordCount2.class.getSimpleName() + "-input");

    generateTestFile(inFile.toString(), "key1", 10000, "key2", 20000, "key3", 15000, "key4", 7000, "key5",
            12000, "key6", 18000);

    for (int i = 0; i < 2; i++) {
        boolean useNewAPI = i == 1;

        igfs.delete(new IgfsPath(PATH_OUTPUT), true);

        flags.put("serializationWasConfigured", false);
        flags.put("partitionerWasConfigured", false);
        flags.put("inputFormatWasConfigured", false);
        flags.put("outputFormatWasConfigured", false);

        JobConf jobConf = new JobConf();

        if (striped)
            jobConf.set(HadoopJobProperty.SHUFFLE_MAPPER_STRIPED_OUTPUT.propertyName(), "true");
        else
            jobConf.set(HadoopJobProperty.SHUFFLE_MAPPER_STRIPED_OUTPUT.propertyName(), "false");

        jobConf.set(CommonConfigurationKeys.IO_SERIALIZATIONS_KEY, CustomSerialization.class.getName());

        // To split into about 6-7 items for v2
        jobConf.setInt(FileInputFormat.SPLIT_MAXSIZE, 65000);

        // For v1
        jobConf.setInt("fs.local.block.size", 65000);

        // File system coordinates.
        setupFileSystems(jobConf);

        HadoopWordCount1.setTasksClasses(jobConf, !useNewAPI, !useNewAPI, !useNewAPI);

        if (!useNewAPI) {
            jobConf.setPartitionerClass(CustomV1Partitioner.class);
            jobConf.setInputFormat(CustomV1InputFormat.class);
            jobConf.setOutputFormat(CustomV1OutputFormat.class);
        }

        Job job = Job.getInstance(jobConf);

        HadoopWordCount2.setTasksClasses(job, useNewAPI, useNewAPI, useNewAPI, false);

        if (useNewAPI) {
            job.setPartitionerClass(CustomV2Partitioner.class);
            job.setInputFormatClass(CustomV2InputFormat.class);
            job.setOutputFormatClass(CustomV2OutputFormat.class);
        }

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path(igfsScheme() + inFile.toString()));
        FileOutputFormat.setOutputPath(job, new Path(igfsScheme() + PATH_OUTPUT));

        job.setNumReduceTasks(3);

        job.setJarByClass(HadoopWordCount2.class);

        IgniteInternalFuture<?> fut = grid(0).hadoop().submit(new HadoopJobId(UUID.randomUUID(), 1),
                createJobInfo(job.getConfiguration()));

        fut.get();

        assertTrue("Serialization was configured (new API is " + useNewAPI + ")",
                flags.get("serializationWasConfigured"));
        assertTrue("Partitioner was configured (new API is = " + useNewAPI + ")",
                flags.get("partitionerWasConfigured"));
        assertTrue("Input format was configured (new API is = " + useNewAPI + ")",
                flags.get("inputFormatWasConfigured"));
        assertTrue("Output format was configured (new API is = " + useNewAPI + ")",
                flags.get("outputFormatWasConfigured"));

        assertEquals("Use new API = " + useNewAPI, "key3\t15000\n" + "key6\t18000\n",
                readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00000"));
        assertEquals("Use new API = " + useNewAPI, "key1\t10000\n" + "key4\t7000\n",
                readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00001"));
        assertEquals("Use new API = " + useNewAPI, "key2\t20000\n" + "key5\t12000\n",
                readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00002"));
    }
}
From source file:org.apache.mahout.classifier.bayes.mapreduce.common.BayesFeatureDriver.java
License:Apache License
@Override
public void runJob(Path input, Path output, BayesParameters params) throws IOException {
    Configurable client = new JobClient();
    JobConf conf = new JobConf(BayesFeatureDriver.class);
    conf.setJobName("Bayes Feature Driver running over input: " + input);

    conf.setOutputKeyClass(StringTuple.class);
    conf.setOutputValueClass(DoubleWritable.class);
    conf.setPartitionerClass(FeaturePartitioner.class);
    conf.setOutputKeyComparatorClass(FeatureLabelComparator.class);

    FileInputFormat.setInputPaths(conf, input);
    FileOutputFormat.setOutputPath(conf, output);

    conf.setMapperClass(BayesFeatureMapper.class);
    conf.setInputFormat(KeyValueTextInputFormat.class);
    conf.setCombinerClass(BayesFeatureCombiner.class);
    conf.setReducerClass(BayesFeatureReducer.class);
    conf.setOutputFormat(BayesFeatureOutputFormat.class);

    // this conf parameter needs to be set to enable serialisation of conf values
    conf.set("io.serializations",
            "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");

    HadoopUtil.overwriteOutput(output);
    conf.set("bayes.parameters", params.toString());

    client.setConf(conf);
    JobClient.runJob(conf);
}
From source file:org.apache.nutch.crawl.Generator.java
License:Apache License
/**
 * Generate fetchlists in one or more segments. Whether to filter URLs or not
 * is read from the crawl.generate.filter property in the configuration files.
 * If the property is not found, the URLs are filtered. Same for the
 * normalisation.
 *
 * @param dbDir Crawl database directory
 * @param segments Segments directory
 * @param numLists Number of reduce tasks
 * @param topN Number of top URLs to be selected
 * @param curTime Current time in milliseconds
 *
 * @return Path to generated segment or null if no entries were selected
 *
 * @throws IOException When an I/O error occurs
 */
public Path[] generate(Path dbDir, Path segments, int numLists, long topN, long curTime, boolean filter,
        boolean norm, boolean force, int maxNumSegments) throws IOException {
    Path tempDir = new Path(
            getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

    Path lock = new Path(dbDir, CrawlDb.LOCK_NAME);
    FileSystem fs = FileSystem.get(getConf());
    LockUtil.createLockFile(fs, lock, force);

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("Generator: starting at " + sdf.format(start));
    LOG.info("Generator: Selecting best-scoring urls due for fetch.");
    LOG.info("Generator: filtering: " + filter);
    LOG.info("Generator: normalizing: " + norm);
    if (topN != Long.MAX_VALUE) {
        LOG.info("Generator: topN: " + topN);
    }
    if ("true".equals(getConf().get(GENERATE_MAX_PER_HOST_BY_IP))) {
        LOG.info("Generator: GENERATE_MAX_PER_HOST_BY_IP will be ignored, use partition.url.mode instead");
    }

    // map to inverted subset due for fetch, sort by score
    JobConf job = new NutchJob(getConf());
    job.setJobName("generate: select from " + dbDir);

    if (numLists == -1) { // for politeness make
        numLists = job.getNumMapTasks(); // a partition per fetch task
    }
    if ("local".equals(job.get("mapred.job.tracker")) && numLists != 1) { // override
        LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
        numLists = 1;
    }
    job.setLong(GENERATOR_CUR_TIME, curTime);
    // record real generation time
    long generateTime = System.currentTimeMillis();
    job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
    job.setLong(GENERATOR_TOP_N, topN);
    job.setBoolean(GENERATOR_FILTER, filter);
    job.setBoolean(GENERATOR_NORMALISE, norm);
    job.setInt(GENERATOR_MAX_NUM_SEGMENTS, maxNumSegments);

    FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(Selector.class);
    job.setPartitionerClass(Selector.class);
    job.setReducerClass(Selector.class);

    FileOutputFormat.setOutputPath(job, tempDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(FloatWritable.class);
    job.setOutputKeyComparatorClass(DecreasingFloatComparator.class);
    job.setOutputValueClass(SelectorEntry.class);
    job.setOutputFormat(GeneratorOutputFormat.class);

    try {
        JobClient.runJob(job);
    } catch (IOException e) {
        throw e;
    }

    // read the subdirectories generated in the temp
    // output and turn them into segments
    List<Path> generatedSegments = new ArrayList<Path>();

    FileStatus[] status = fs.listStatus(tempDir);
    try {
        for (FileStatus stat : status) {
            Path subfetchlist = stat.getPath();
            if (!subfetchlist.getName().startsWith("fetchlist-"))
                continue;
            // start a new partition job for this segment
            Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists);
            generatedSegments.add(newSeg);
        }
    } catch (Exception e) {
        LOG.warn("Generator: exception while partitioning segments, exiting ...");
        fs.delete(tempDir, true);
        return null;
    }

    if (generatedSegments.size() == 0) {
        LOG.warn("Generator: 0 records selected for fetching, exiting ...");
        LockUtil.removeLockFile(fs, lock);
        fs.delete(tempDir, true);
        return null;
    }

    if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
        // update the db from tempDir
        Path tempDir2 = new Path(
                getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

        job = new NutchJob(getConf());
        job.setJobName("generate: updatedb " + dbDir);
        job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
        for (Path segmpaths : generatedSegments) {
            Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME);
            FileInputFormat.addInputPath(job, subGenDir);
        }
        FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setMapperClass(CrawlDbUpdater.class);
        job.setReducerClass(CrawlDbUpdater.class);
        job.setOutputFormat(MapFileOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(CrawlDatum.class);
        FileOutputFormat.setOutputPath(job, tempDir2);
        try {
            JobClient.runJob(job);
            CrawlDb.install(job, dbDir);
        } catch (IOException e) {
            LockUtil.removeLockFile(fs, lock);
            fs.delete(tempDir, true);
            fs.delete(tempDir2, true);
            throw e;
        }
        fs.delete(tempDir2, true);
    }

    LockUtil.removeLockFile(fs, lock);
    fs.delete(tempDir, true);

    long end = System.currentTimeMillis();
    LOG.info("Generator: finished at " + sdf.format(end) + ", elapsed: "
            + TimingUtil.elapsedTime(start, end));

    Path[] patharray = new Path[generatedSegments.size()];
    return generatedSegments.toArray(patharray);
}
From source file:org.apache.nutch.indexer.DeleteDuplicates.java
License:Apache License
public void dedup(Path[] indexDirs) throws IOException {
    if (LOG.isInfoEnabled()) {
        LOG.info("Dedup: starting");
    }

    Path outDir1 = new Path("dedup-urls-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
    JobConf job = new NutchJob(getConf());

    for (int i = 0; i < indexDirs.length; i++) {
        if (LOG.isInfoEnabled()) {
            LOG.info("Dedup: adding indexes in: " + indexDirs[i]);
        }
        job.addInputPath(indexDirs[i]);
    }
    job.setJobName("dedup 1: urls by time");

    job.setInputFormat(InputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IndexDoc.class);

    job.setReducerClass(UrlsReducer.class);
    job.setOutputPath(outDir1);

    job.setOutputKeyClass(MD5Hash.class);
    job.setOutputValueClass(IndexDoc.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    JobClient.runJob(job);

    Path outDir2 = new Path("dedup-hash-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
    job = new NutchJob(getConf());
    job.setJobName("dedup 2: content by hash");

    job.addInputPath(outDir1);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapOutputKeyClass(MD5Hash.class);
    job.setMapOutputValueClass(IndexDoc.class);
    job.setPartitionerClass(HashPartitioner.class);
    job.setSpeculativeExecution(false);

    job.setReducerClass(HashReducer.class);
    job.setOutputPath(outDir2);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IndexDoc.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    JobClient.runJob(job);

    // remove outDir1 - no longer needed
    fs.delete(outDir1);

    job = new NutchJob(getConf());
    job.setJobName("dedup 3: delete from index(es)");

    job.addInputPath(outDir2);
    job.setInputFormat(SequenceFileInputFormat.class);
    // job.setInputKeyClass(Text.class);
    // job.setInputValueClass(IndexDoc.class);
    job.setInt("io.file.buffer.size", 4096);
    job.setMapperClass(DeleteDuplicates.class);
    job.setReducerClass(DeleteDuplicates.class);

    job.setOutputFormat(DeleteDuplicates.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    JobClient.runJob(job);

    fs.delete(outDir2);

    if (LOG.isInfoEnabled()) {
        LOG.info("Dedup: done");
    }
}
From source file:org.apache.nutch.tools.FreeGenerator.java
License:Apache License
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: FreeGenerator <inputDir> <segmentsDir> [-filter] [-normalize]");
        System.err.println("\tinputDir\tinput directory containing one or more input files.");
        System.err.println("\t\tEach text file contains a list of URLs, one URL per line");
        System.err.println("\tsegmentsDir\toutput directory, where new segment will be created");
        System.err.println("\t-filter\trun current URLFilters on input URLs");
        System.err.println("\t-normalize\trun current URLNormalizers on input URLs");
        return -1;
    }

    boolean filter = false;
    boolean normalize = false;
    if (args.length > 2) {
        for (int i = 2; i < args.length; i++) {
            if (args[i].equals("-filter")) {
                filter = true;
            } else if (args[i].equals("-normalize")) {
                normalize = true;
            } else {
                LOG.error("Unknown argument: " + args[i] + ", exiting ...");
                return -1;
            }
        }
    }

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("FreeGenerator: starting at " + sdf.format(start));

    JobConf job = new NutchJob(getConf());
    job.setBoolean(FILTER_KEY, filter);
    job.setBoolean(NORMALIZE_KEY, normalize);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    job.setInputFormat(TextInputFormat.class);
    job.setMapperClass(FG.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Generator.SelectorEntry.class);
    job.setPartitionerClass(URLPartitioner.class);
    job.setReducerClass(FG.class);
    String segName = Generator.generateSegmentName();
    job.setNumReduceTasks(job.getNumMapTasks());
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    job.setOutputKeyComparatorClass(Generator.HashComparator.class);
    FileOutputFormat.setOutputPath(job, new Path(args[1], new Path(segName, CrawlDatum.GENERATE_DIR_NAME)));
    try {
        JobClient.runJob(job);
    } catch (Exception e) {
        LOG.error("FAILED: " + StringUtils.stringifyException(e));
        return -1;
    }

    long end = System.currentTimeMillis();
    LOG.info("FreeGenerator: finished at " + sdf.format(end) + ", elapsed: "
            + TimingUtil.elapsedTime(start, end));
    return 0;
}