List of usage examples for org.apache.hadoop.mapred JobConf setPartitionerClass
public void setPartitionerClass(Class<? extends Partitioner> theClass)
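Before the project examples below, here is a minimal, self-contained sketch of how this method is typically used with the old mapred API: a custom Partitioner implementation plus the JobConf wiring. The class name PrefixPartitioner, the key/value types, and the command-line paths are illustrative assumptions, not taken from any of the projects listed; the mapper and reducer classes a real job would need are left as commented placeholders.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Partitioner;

/** Hypothetical partitioner: routes each key to a reducer by the text before the first ':'. */
public class PrefixPartitioner implements Partitioner<Text, IntWritable> {

    public void configure(JobConf job) {
        // no per-job configuration needed in this sketch
    }

    public int getPartition(Text key, IntWritable value, int numPartitions) {
        String prefix = key.toString().split(":", 2)[0];
        // mask the sign bit so the result is always a valid partition index
        return (prefix.hashCode() & Integer.MAX_VALUE) % numPartitions;
    }

    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(PrefixPartitioner.class);
        conf.setJobName("setPartitionerClass-demo");
        // conf.setMapperClass(...); conf.setReducerClass(...); // job-specific classes omitted
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);
        conf.setPartitionerClass(PrefixPartitioner.class); // the call documented on this page
        conf.setNumReduceTasks(4); // getPartition() receives this value as numPartitions
        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));
        JobClient.runJob(conf);
    }
}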
From source file:net.peacesoft.nutch.crawl.ReGenerator.java
License:Apache License
/**
 * Generate fetchlists in one or more segments. Whether to filter URLs or
 * not is read from the crawl.generate.filter property in the configuration
 * files. If the property is not found, the URLs are filtered. Same for the
 * normalisation.
 *
 * @param dbDir Crawl database directory
 * @param segments Segments directory
 * @param numLists Number of reduce tasks
 * @param topN Number of top URLs to be selected
 * @param curTime Current time in milliseconds
 *
 * @return Path to generated segment or null if no entries were selected
 *
 * @throws IOException When an I/O error occurs
 */
public Path[] generate(Path dbDir, Path segments, int numLists, long topN, long curTime, boolean filter,
        boolean norm, boolean force, int maxNumSegments) throws IOException {
    try {
        Path tempDir = new Path(
                getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

        Path lock = new Path(dbDir, CrawlDb.LOCK_NAME);
        FileSystem fs = FileSystem.get(getConf());
        LockUtil.createLockFile(fs, lock, force);

        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        long start = System.currentTimeMillis();
        LOG.info("ReGenerator: starting at " + sdf.format(start));
        LOG.info("ReGenerator: Selecting best-scoring urls due for fetch.");
        LOG.info("ReGenerator: filtering: " + filter);
        LOG.info("ReGenerator: normalizing: " + norm);
        if (topN != Long.MAX_VALUE) {
            LOG.info("ReGenerator: topN: " + topN);
        }
        if ("true".equals(getConf().get(GENERATE_MAX_PER_HOST_BY_IP))) {
            LOG.info("ReGenerator: GENERATE_MAX_PER_HOST_BY_IP will be ignored, use partition.url.mode instead");
        }

        // map to inverted subset due for fetch, sort by score
        JobConf job = new NutchJob(getConf());
        job.setJobName("generate: select from " + dbDir);

        if (numLists == -1) { // for politeness make
            numLists = job.getNumMapTasks(); // a partition per fetch task
        }
        if ("local".equals(job.get("mapred.job.tracker")) && numLists != 1) { // override
            LOG.info("ReGenerator: jobtracker is 'local', generating exactly one partition.");
            numLists = 1;
        }
        job.setLong(GENERATOR_CUR_TIME, curTime);
        // record real generation time
        long generateTime = System.currentTimeMillis();
        job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
        job.setLong(GENERATOR_TOP_N, topN);
        job.setBoolean(GENERATOR_FILTER, filter);
        job.setBoolean(GENERATOR_NORMALISE, norm);
        job.setInt(GENERATOR_MAX_NUM_SEGMENTS, maxNumSegments);

        FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
        job.setInputFormat(SequenceFileInputFormat.class);

        job.setMapperClass(Selector.class);
        job.setPartitionerClass(Selector.class);
        job.setReducerClass(Selector.class);

        FileOutputFormat.setOutputPath(job, tempDir);
        job.setOutputFormat(SequenceFileOutputFormat.class);
        job.setOutputKeyClass(FloatWritable.class);
        job.setOutputKeyComparatorClass(DecreasingFloatComparator.class);
        job.setOutputValueClass(SelectorEntry.class);
        job.setOutputFormat(GeneratorOutputFormat.class);

        try {
            JobClient.runJob(job);
        } catch (IOException e) {
            throw e;
        }

        // read the subdirectories generated in the temp
        // output and turn them into segments
        List<Path> generatedSegments = new ArrayList<Path>();

        FileStatus[] status = fs.listStatus(tempDir);
        try {
            for (FileStatus stat : status) {
                Path subfetchlist = stat.getPath();
                if (!subfetchlist.getName().startsWith("fetchlist-")) {
                    continue;
                }
                // start a new partition job for this segment
                Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists);
                generatedSegments.add(newSeg);
            }
        } catch (Exception e) {
            LOG.warn("ReGenerator: exception while partitioning segments, exiting ...");
            fs.delete(tempDir, true);
            return null;
        }

        if (generatedSegments.size() == 0) {
            LOG.warn("ReGenerator: 0 records selected for fetching, exiting ...");
            LockUtil.removeLockFile(fs, lock);
            fs.delete(tempDir, true);
            return null;
        }

        if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
            // update the db from tempDir
            Path tempDir2 = new Path(
                    getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

            job = new NutchJob(getConf());
            job.setJobName("generate: updatedb " + dbDir);
            job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
            for (Path segmpaths : generatedSegments) {
                Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME);
                FileInputFormat.addInputPath(job, subGenDir);
            }
            FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
            job.setInputFormat(SequenceFileInputFormat.class);
            job.setMapperClass(CrawlDbUpdater.class);
            job.setReducerClass(CrawlDbUpdater.class);
            job.setOutputFormat(MapFileOutputFormat.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(CrawlDatum.class);
            FileOutputFormat.setOutputPath(job, tempDir2);
            try {
                JobClient.runJob(job);
                CrawlDb.install(job, dbDir);
            } catch (IOException e) {
                LockUtil.removeLockFile(fs, lock);
                fs.delete(tempDir, true);
                fs.delete(tempDir2, true);
                throw e;
            }
            fs.delete(tempDir2, true);
        }

        LockUtil.removeLockFile(fs, lock);
        fs.delete(tempDir, true);

        long end = System.currentTimeMillis();
        LOG.info("ReGenerator: finished at " + sdf.format(end) + ", elapsed: "
                + TimingUtil.elapsedTime(start, end));

        Path[] patharray = new Path[generatedSegments.size()];
        return generatedSegments.toArray(patharray);
    } catch (Exception ex) {
        LOG.error("ReGenerator generate error: " + ex.toString(), ex);
        return null;
    }
}
From source file:org.acacia.csr.java.CSRConverter.java
License:Apache License
public static void main(String[] args) throws Exception {
    if (!validArgs(args)) {
        printUsage();
        return;
    }

    // These are the temp paths that are created on HDFS
    String dir1 = "/user/miyuru/csrconverter-output";
    String dir2 = "/user/miyuru/csrconverter-output-sorted";

    // We first delete the temporary directories if they exist on the HDFS
    FileSystem fs1 = FileSystem.get(new JobConf());

    System.out.println("Deleting the dir : " + dir1);
    if (fs1.exists(new Path(dir1))) {
        fs1.delete(new Path(dir1), true);
    }
    System.out.println("Done deleting the dir : " + dir1);

    System.out.println("Deleting the dir : " + dir2);
    if (fs1.exists(new Path(dir2))) {
        fs1.delete(new Path(dir2), true);
    }

    Path notinPath = new Path("/user/miyuru/notinverts/notinverts");
    if (!fs1.exists(notinPath)) {
        fs1.create(notinPath);
    }
    System.out.println("Done deleting the dir : " + dir2);

    // Note on Aug 23 2014: Sometimes after this the mapReduce job hangs. need to see why.
    VertexCounterClient.setDefaultGraphID(args[3], args[2]);

    // First job creates the inverted index
    JobConf conf = new JobConf(CSRConverter.class);
    conf.set("org.acacia.partitioner.hbase.zookeeper.quorum", args[1]);
    conf.set("org.acacia.partitioner.hbase.table", args[2]);
    conf.set("org.acacia.partitioner.hbase.contacthost", args[3]);
    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(Text.class);

    // conf.setMapperClass(InvertedMapper.class);
    conf.setReducerClass(InvertedReducer.class);
    // conf.setInputFormat(TextInputFormat.class);
    conf.setInputFormat(NLinesInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    // FileInputFormat.setInputPaths(conf, new Path(args[0]));
    MultipleInputs.addInputPath(conf, new Path(args[0]), NLinesInputFormat.class, InvertedMapper.class);
    MultipleInputs.addInputPath(conf, new Path("/user/miyuru/notinverts/notinverts"), TextInputFormat.class,
            InvertedMapper.class);
    FileOutputFormat.setOutputPath(conf, new Path(dir1));

    // Also for the moment we turn-off the speculative execution
    conf.setBoolean("mapred.map.tasks.speculative.execution", false);
    conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
    conf.setNumMapTasks(96);
    conf.setNumReduceTasks(96);
    conf.setPartitionerClass(VertexPartitioner.class);
    conf.set("vertex-count", args[4]);
    conf.set("zero-flag", args[5]);

    Job job = new Job(conf, "csr_inverter");
    job.setSortComparatorClass(SortComparator.class);
    job.waitForCompletion(true);
}
From source file:org.apache.avro.mapred.tether.TetherJob.java
License:Apache License
private static void setupTetherJob(JobConf job) throws IOException {
    job.setMapRunnerClass(TetherMapRunner.class);
    job.setPartitionerClass(TetherPartitioner.class);
    job.setReducerClass(TetherReducer.class);

    job.setInputFormat(TetherInputFormat.class);
    job.setOutputFormat(TetherOutputFormat.class);

    job.setOutputKeyClass(TetherData.class);
    job.setOutputKeyComparatorClass(TetherKeyComparator.class);
    job.setMapOutputValueClass(NullWritable.class);

    // set the map output key class to TetherData
    job.setMapOutputKeyClass(TetherData.class);

    // add TetherKeySerialization to io.serializations
    Collection<String> serializations = job.getStringCollection("io.serializations");
    if (!serializations.contains(TetherKeySerialization.class.getName())) {
        serializations.add(TetherKeySerialization.class.getName());
        job.setStrings("io.serializations", serializations.toArray(new String[0]));
    }

    // determine whether the executable should be added to the cache.
    if (job.getBoolean(TETHER_EXEC_CACHED, false)) {
        DistributedCache.addCacheFile(getExecutable(job), job);
    }
}
From source file:org.apache.ignite.internal.processors.hadoop.GridHadoopMapReduceEmbeddedSelfTest.java
License:Apache License
/**
 * Tests whole job execution with all phases in old and new versions of API with definition of custom
 * Serialization, Partitioner and IO formats.
 * @throws Exception If fails.
 */
public void testMultiReducerWholeMapReduceExecution() throws Exception {
    IgfsPath inDir = new IgfsPath(PATH_INPUT);
    igfs.mkdirs(inDir);

    IgfsPath inFile = new IgfsPath(inDir, GridHadoopWordCount2.class.getSimpleName() + "-input");

    generateTestFile(inFile.toString(), "key1", 10000, "key2", 20000, "key3", 15000, "key4", 7000, "key5",
            12000, "key6", 18000);

    for (int i = 0; i < 2; i++) {
        boolean useNewAPI = i == 1;

        igfs.delete(new IgfsPath(PATH_OUTPUT), true);

        flags.put("serializationWasConfigured", false);
        flags.put("partitionerWasConfigured", false);
        flags.put("inputFormatWasConfigured", false);
        flags.put("outputFormatWasConfigured", false);

        JobConf jobConf = new JobConf();

        jobConf.set(CommonConfigurationKeys.IO_SERIALIZATIONS_KEY, CustomSerialization.class.getName());

        // To split into about 6-7 items for v2
        jobConf.setInt(FileInputFormat.SPLIT_MAXSIZE, 65000);

        // For v1
        jobConf.setInt("fs.local.block.size", 65000);

        // File system coordinates.
        setupFileSystems(jobConf);

        GridHadoopWordCount1.setTasksClasses(jobConf, !useNewAPI, !useNewAPI, !useNewAPI);

        if (!useNewAPI) {
            jobConf.setPartitionerClass(CustomV1Partitioner.class);
            jobConf.setInputFormat(CustomV1InputFormat.class);
            jobConf.setOutputFormat(CustomV1OutputFormat.class);
        }

        Job job = Job.getInstance(jobConf);

        GridHadoopWordCount2.setTasksClasses(job, useNewAPI, useNewAPI, useNewAPI);

        if (useNewAPI) {
            job.setPartitionerClass(CustomV2Partitioner.class);
            job.setInputFormatClass(CustomV2InputFormat.class);
            job.setOutputFormatClass(CustomV2OutputFormat.class);
        }

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path(igfsScheme() + inFile.toString()));
        FileOutputFormat.setOutputPath(job, new Path(igfsScheme() + PATH_OUTPUT));

        job.setNumReduceTasks(3);

        job.setJarByClass(GridHadoopWordCount2.class);

        IgniteInternalFuture<?> fut = grid(0).hadoop().submit(new GridHadoopJobId(UUID.randomUUID(), 1),
                createJobInfo(job.getConfiguration()));

        fut.get();

        assertTrue("Serialization was configured (new API is " + useNewAPI + ")",
                flags.get("serializationWasConfigured"));
        assertTrue("Partitioner was configured (new API is = " + useNewAPI + ")",
                flags.get("partitionerWasConfigured"));
        assertTrue("Input format was configured (new API is = " + useNewAPI + ")",
                flags.get("inputFormatWasConfigured"));
        assertTrue("Output format was configured (new API is = " + useNewAPI + ")",
                flags.get("outputFormatWasConfigured"));

        assertEquals("Use new API = " + useNewAPI, "key3\t15000\n" + "key6\t18000\n",
                readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00000"));
        assertEquals("Use new API = " + useNewAPI, "key1\t10000\n" + "key4\t7000\n",
                readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00001"));
        assertEquals("Use new API = " + useNewAPI, "key2\t20000\n" + "key5\t12000\n",
                readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00002"));
    }
}
From source file:org.apache.ignite.internal.processors.hadoop.HadoopMapReduceEmbeddedSelfTest.java
License:Apache License
/**
 * Tests whole job execution with all phases in old and new versions of API with definition of custom
 * Serialization, Partitioner and IO formats.
 * @throws Exception If fails.
 */
public void testMultiReducerWholeMapReduceExecution() throws Exception {
    IgfsPath inDir = new IgfsPath(PATH_INPUT);
    igfs.mkdirs(inDir);

    IgfsPath inFile = new IgfsPath(inDir, HadoopWordCount2.class.getSimpleName() + "-input");

    generateTestFile(inFile.toString(), "key1", 10000, "key2", 20000, "key3", 15000, "key4", 7000, "key5",
            12000, "key6", 18000);

    for (int i = 0; i < 2; i++) {
        boolean useNewAPI = i == 1;

        igfs.delete(new IgfsPath(PATH_OUTPUT), true);

        flags.put("serializationWasConfigured", false);
        flags.put("partitionerWasConfigured", false);
        flags.put("inputFormatWasConfigured", false);
        flags.put("outputFormatWasConfigured", false);

        JobConf jobConf = new JobConf();

        jobConf.set(CommonConfigurationKeys.IO_SERIALIZATIONS_KEY, CustomSerialization.class.getName());

        // To split into about 6-7 items for v2
        jobConf.setInt(FileInputFormat.SPLIT_MAXSIZE, 65000);

        // For v1
        jobConf.setInt("fs.local.block.size", 65000);

        // File system coordinates.
        setupFileSystems(jobConf);

        HadoopWordCount1.setTasksClasses(jobConf, !useNewAPI, !useNewAPI, !useNewAPI);

        if (!useNewAPI) {
            jobConf.setPartitionerClass(CustomV1Partitioner.class);
            jobConf.setInputFormat(CustomV1InputFormat.class);
            jobConf.setOutputFormat(CustomV1OutputFormat.class);
        }

        Job job = Job.getInstance(jobConf);

        HadoopWordCount2.setTasksClasses(job, useNewAPI, useNewAPI, useNewAPI);

        if (useNewAPI) {
            job.setPartitionerClass(CustomV2Partitioner.class);
            job.setInputFormatClass(CustomV2InputFormat.class);
            job.setOutputFormatClass(CustomV2OutputFormat.class);
        }

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path(igfsScheme() + inFile.toString()));
        FileOutputFormat.setOutputPath(job, new Path(igfsScheme() + PATH_OUTPUT));

        job.setNumReduceTasks(3);

        job.setJarByClass(HadoopWordCount2.class);

        IgniteInternalFuture<?> fut = grid(0).hadoop().submit(new HadoopJobId(UUID.randomUUID(), 1),
                createJobInfo(job.getConfiguration()));

        fut.get();

        assertTrue("Serialization was configured (new API is " + useNewAPI + ")",
                flags.get("serializationWasConfigured"));
        assertTrue("Partitioner was configured (new API is = " + useNewAPI + ")",
                flags.get("partitionerWasConfigured"));
        assertTrue("Input format was configured (new API is = " + useNewAPI + ")",
                flags.get("inputFormatWasConfigured"));
        assertTrue("Output format was configured (new API is = " + useNewAPI + ")",
                flags.get("outputFormatWasConfigured"));

        assertEquals("Use new API = " + useNewAPI, "key3\t15000\n" + "key6\t18000\n",
                readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00000"));
        assertEquals("Use new API = " + useNewAPI, "key1\t10000\n" + "key4\t7000\n",
                readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00001"));
        assertEquals("Use new API = " + useNewAPI, "key2\t20000\n" + "key5\t12000\n",
                readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00002"));
    }
}
From source file:org.apache.ignite.internal.processors.hadoop.impl.HadoopMapReduceEmbeddedSelfTest.java
License:Apache License
/**
 * Tests whole job execution with all phases in old and new versions of API with definition of custom
 * Serialization, Partitioner and IO formats.
 *
 * @param striped Whether output should be striped or not.
 * @throws Exception If fails.
 */
public void checkMultiReducerWholeMapReduceExecution(boolean striped) throws Exception {
    IgfsPath inDir = new IgfsPath(PATH_INPUT);
    igfs.mkdirs(inDir);

    IgfsPath inFile = new IgfsPath(inDir, HadoopWordCount2.class.getSimpleName() + "-input");

    generateTestFile(inFile.toString(), "key1", 10000, "key2", 20000, "key3", 15000, "key4", 7000, "key5",
            12000, "key6", 18000);

    for (int i = 0; i < 2; i++) {
        boolean useNewAPI = i == 1;

        igfs.delete(new IgfsPath(PATH_OUTPUT), true);

        flags.put("serializationWasConfigured", false);
        flags.put("partitionerWasConfigured", false);
        flags.put("inputFormatWasConfigured", false);
        flags.put("outputFormatWasConfigured", false);

        JobConf jobConf = new JobConf();

        if (striped)
            jobConf.set(HadoopJobProperty.SHUFFLE_MAPPER_STRIPED_OUTPUT.propertyName(), "true");
        else
            jobConf.set(HadoopJobProperty.SHUFFLE_MAPPER_STRIPED_OUTPUT.propertyName(), "false");

        jobConf.set(CommonConfigurationKeys.IO_SERIALIZATIONS_KEY, CustomSerialization.class.getName());

        // To split into about 6-7 items for v2
        jobConf.setInt(FileInputFormat.SPLIT_MAXSIZE, 65000);

        // For v1
        jobConf.setInt("fs.local.block.size", 65000);

        // File system coordinates.
        setupFileSystems(jobConf);

        HadoopWordCount1.setTasksClasses(jobConf, !useNewAPI, !useNewAPI, !useNewAPI);

        if (!useNewAPI) {
            jobConf.setPartitionerClass(CustomV1Partitioner.class);
            jobConf.setInputFormat(CustomV1InputFormat.class);
            jobConf.setOutputFormat(CustomV1OutputFormat.class);
        }

        Job job = Job.getInstance(jobConf);

        HadoopWordCount2.setTasksClasses(job, useNewAPI, useNewAPI, useNewAPI, false);

        if (useNewAPI) {
            job.setPartitionerClass(CustomV2Partitioner.class);
            job.setInputFormatClass(CustomV2InputFormat.class);
            job.setOutputFormatClass(CustomV2OutputFormat.class);
        }

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path(igfsScheme() + inFile.toString()));
        FileOutputFormat.setOutputPath(job, new Path(igfsScheme() + PATH_OUTPUT));

        job.setNumReduceTasks(3);

        job.setJarByClass(HadoopWordCount2.class);

        IgniteInternalFuture<?> fut = grid(0).hadoop().submit(new HadoopJobId(UUID.randomUUID(), 1),
                createJobInfo(job.getConfiguration()));

        fut.get();

        assertTrue("Serialization was configured (new API is " + useNewAPI + ")",
                flags.get("serializationWasConfigured"));
        assertTrue("Partitioner was configured (new API is = " + useNewAPI + ")",
                flags.get("partitionerWasConfigured"));
        assertTrue("Input format was configured (new API is = " + useNewAPI + ")",
                flags.get("inputFormatWasConfigured"));
        assertTrue("Output format was configured (new API is = " + useNewAPI + ")",
                flags.get("outputFormatWasConfigured"));

        assertEquals("Use new API = " + useNewAPI, "key3\t15000\n" + "key6\t18000\n",
                readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00000"));
        assertEquals("Use new API = " + useNewAPI, "key1\t10000\n" + "key4\t7000\n",
                readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00001"));
        assertEquals("Use new API = " + useNewAPI, "key2\t20000\n" + "key5\t12000\n",
                readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00002"));
    }
}
From source file:org.apache.mahout.classifier.bayes.mapreduce.common.BayesFeatureDriver.java
License:Apache License
@Override
public void runJob(Path input, Path output, BayesParameters params) throws IOException {
    Configurable client = new JobClient();
    JobConf conf = new JobConf(BayesFeatureDriver.class);
    conf.setJobName("Bayes Feature Driver running over input: " + input);

    conf.setOutputKeyClass(StringTuple.class);
    conf.setOutputValueClass(DoubleWritable.class);
    conf.setPartitionerClass(FeaturePartitioner.class);
    conf.setOutputKeyComparatorClass(FeatureLabelComparator.class);

    FileInputFormat.setInputPaths(conf, input);
    FileOutputFormat.setOutputPath(conf, output);

    conf.setMapperClass(BayesFeatureMapper.class);
    conf.setInputFormat(KeyValueTextInputFormat.class);
    conf.setCombinerClass(BayesFeatureCombiner.class);
    conf.setReducerClass(BayesFeatureReducer.class);
    conf.setOutputFormat(BayesFeatureOutputFormat.class);

    // this conf parameter needs to be set to enable serialisation of conf values
    conf.set("io.serializations",
            "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");

    HadoopUtil.overwriteOutput(output);
    conf.set("bayes.parameters", params.toString());

    client.setConf(conf);
    JobClient.runJob(conf);
}
From source file:org.apache.nutch.crawl.Generator.java
License:Apache License
/**
 * Generate fetchlists in one or more segments. Whether to filter URLs or not
 * is read from the crawl.generate.filter property in the configuration files.
 * If the property is not found, the URLs are filtered. Same for the
 * normalisation.
 *
 * @param dbDir Crawl database directory
 * @param segments Segments directory
 * @param numLists Number of reduce tasks
 * @param topN Number of top URLs to be selected
 * @param curTime Current time in milliseconds
 *
 * @return Path to generated segment or null if no entries were selected
 *
 * @throws IOException When an I/O error occurs
 */
public Path[] generate(Path dbDir, Path segments, int numLists, long topN, long curTime, boolean filter,
        boolean norm, boolean force, int maxNumSegments) throws IOException {
    Path tempDir = new Path(
            getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

    Path lock = new Path(dbDir, CrawlDb.LOCK_NAME);
    FileSystem fs = FileSystem.get(getConf());
    LockUtil.createLockFile(fs, lock, force);

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("Generator: starting at " + sdf.format(start));
    LOG.info("Generator: Selecting best-scoring urls due for fetch.");
    LOG.info("Generator: filtering: " + filter);
    LOG.info("Generator: normalizing: " + norm);
    if (topN != Long.MAX_VALUE) {
        LOG.info("Generator: topN: " + topN);
    }
    if ("true".equals(getConf().get(GENERATE_MAX_PER_HOST_BY_IP))) {
        LOG.info("Generator: GENERATE_MAX_PER_HOST_BY_IP will be ignored, use partition.url.mode instead");
    }

    // map to inverted subset due for fetch, sort by score
    JobConf job = new NutchJob(getConf());
    job.setJobName("generate: select from " + dbDir);

    if (numLists == -1) { // for politeness make
        numLists = job.getNumMapTasks(); // a partition per fetch task
    }
    if ("local".equals(job.get("mapred.job.tracker")) && numLists != 1) { // override
        LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
        numLists = 1;
    }
    job.setLong(GENERATOR_CUR_TIME, curTime);
    // record real generation time
    long generateTime = System.currentTimeMillis();
    job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
    job.setLong(GENERATOR_TOP_N, topN);
    job.setBoolean(GENERATOR_FILTER, filter);
    job.setBoolean(GENERATOR_NORMALISE, norm);
    job.setInt(GENERATOR_MAX_NUM_SEGMENTS, maxNumSegments);

    FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(Selector.class);
    job.setPartitionerClass(Selector.class);
    job.setReducerClass(Selector.class);

    FileOutputFormat.setOutputPath(job, tempDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(FloatWritable.class);
    job.setOutputKeyComparatorClass(DecreasingFloatComparator.class);
    job.setOutputValueClass(SelectorEntry.class);
    job.setOutputFormat(GeneratorOutputFormat.class);

    try {
        JobClient.runJob(job);
    } catch (IOException e) {
        throw e;
    }

    // read the subdirectories generated in the temp
    // output and turn them into segments
    List<Path> generatedSegments = new ArrayList<Path>();

    FileStatus[] status = fs.listStatus(tempDir);
    try {
        for (FileStatus stat : status) {
            Path subfetchlist = stat.getPath();
            if (!subfetchlist.getName().startsWith("fetchlist-"))
                continue;
            // start a new partition job for this segment
            Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists);
            generatedSegments.add(newSeg);
        }
    } catch (Exception e) {
        LOG.warn("Generator: exception while partitioning segments, exiting ...");
        fs.delete(tempDir, true);
        return null;
    }

    if (generatedSegments.size() == 0) {
        LOG.warn("Generator: 0 records selected for fetching, exiting ...");
        LockUtil.removeLockFile(fs, lock);
        fs.delete(tempDir, true);
        return null;
    }

    if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
        // update the db from tempDir
        Path tempDir2 = new Path(
                getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

        job = new NutchJob(getConf());
        job.setJobName("generate: updatedb " + dbDir);
        job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
        for (Path segmpaths : generatedSegments) {
            Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME);
            FileInputFormat.addInputPath(job, subGenDir);
        }
        FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setMapperClass(CrawlDbUpdater.class);
        job.setReducerClass(CrawlDbUpdater.class);
        job.setOutputFormat(MapFileOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(CrawlDatum.class);
        FileOutputFormat.setOutputPath(job, tempDir2);
        try {
            JobClient.runJob(job);
            CrawlDb.install(job, dbDir);
        } catch (IOException e) {
            LockUtil.removeLockFile(fs, lock);
            fs.delete(tempDir, true);
            fs.delete(tempDir2, true);
            throw e;
        }
        fs.delete(tempDir2, true);
    }

    LockUtil.removeLockFile(fs, lock);
    fs.delete(tempDir, true);

    long end = System.currentTimeMillis();
    LOG.info("Generator: finished at " + sdf.format(end) + ", elapsed: "
            + TimingUtil.elapsedTime(start, end));

    Path[] patharray = new Path[generatedSegments.size()];
    return generatedSegments.toArray(patharray);
}
From source file:org.apache.nutch.indexer.DeleteDuplicates.java
License:Apache License
public void dedup(Path[] indexDirs) throws IOException {
    if (LOG.isInfoEnabled()) {
        LOG.info("Dedup: starting");
    }

    Path outDir1 = new Path("dedup-urls-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
    JobConf job = new NutchJob(getConf());

    for (int i = 0; i < indexDirs.length; i++) {
        if (LOG.isInfoEnabled()) {
            LOG.info("Dedup: adding indexes in: " + indexDirs[i]);
        }
        job.addInputPath(indexDirs[i]);
    }
    job.setJobName("dedup 1: urls by time");

    job.setInputFormat(InputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IndexDoc.class);

    job.setReducerClass(UrlsReducer.class);
    job.setOutputPath(outDir1);

    job.setOutputKeyClass(MD5Hash.class);
    job.setOutputValueClass(IndexDoc.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    JobClient.runJob(job);

    Path outDir2 = new Path("dedup-hash-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
    job = new NutchJob(getConf());
    job.setJobName("dedup 2: content by hash");

    job.addInputPath(outDir1);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapOutputKeyClass(MD5Hash.class);
    job.setMapOutputValueClass(IndexDoc.class);
    job.setPartitionerClass(HashPartitioner.class);
    job.setSpeculativeExecution(false);

    job.setReducerClass(HashReducer.class);
    job.setOutputPath(outDir2);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IndexDoc.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    JobClient.runJob(job);

    // remove outDir1 - no longer needed
    fs.delete(outDir1);

    job = new NutchJob(getConf());
    job.setJobName("dedup 3: delete from index(es)");

    job.addInputPath(outDir2);
    job.setInputFormat(SequenceFileInputFormat.class);
    // job.setInputKeyClass(Text.class);
    // job.setInputValueClass(IndexDoc.class);
    job.setInt("io.file.buffer.size", 4096);
    job.setMapperClass(DeleteDuplicates.class);
    job.setReducerClass(DeleteDuplicates.class);

    job.setOutputFormat(DeleteDuplicates.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    JobClient.runJob(job);

    fs.delete(outDir2);

    if (LOG.isInfoEnabled()) {
        LOG.info("Dedup: done");
    }
}
From source file:org.apache.nutch.tools.FreeGenerator.java
License:Apache License
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: FreeGenerator <inputDir> <segmentsDir> [-filter] [-normalize]");
        System.err.println("\tinputDir\tinput directory containing one or more input files.");
        System.err.println("\t\tEach text file contains a list of URLs, one URL per line");
        System.err.println("\tsegmentsDir\toutput directory, where new segment will be created");
        System.err.println("\t-filter\trun current URLFilters on input URLs");
        System.err.println("\t-normalize\trun current URLNormalizers on input URLs");
        return -1;
    }

    boolean filter = false;
    boolean normalize = false;
    if (args.length > 2) {
        for (int i = 2; i < args.length; i++) {
            if (args[i].equals("-filter")) {
                filter = true;
            } else if (args[i].equals("-normalize")) {
                normalize = true;
            } else {
                LOG.error("Unknown argument: " + args[i] + ", exiting ...");
                return -1;
            }
        }
    }

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("FreeGenerator: starting at " + sdf.format(start));

    JobConf job = new NutchJob(getConf());
    job.setBoolean(FILTER_KEY, filter);
    job.setBoolean(NORMALIZE_KEY, normalize);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    job.setInputFormat(TextInputFormat.class);
    job.setMapperClass(FG.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Generator.SelectorEntry.class);
    job.setPartitionerClass(URLPartitioner.class);
    job.setReducerClass(FG.class);
    String segName = Generator.generateSegmentName();
    job.setNumReduceTasks(job.getNumMapTasks());
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    job.setOutputKeyComparatorClass(Generator.HashComparator.class);
    FileOutputFormat.setOutputPath(job, new Path(args[1], new Path(segName, CrawlDatum.GENERATE_DIR_NAME)));
    try {
        JobClient.runJob(job);
    } catch (Exception e) {
        LOG.error("FAILED: " + StringUtils.stringifyException(e));
        return -1;
    }

    long end = System.currentTimeMillis();
    LOG.info("FreeGenerator: finished at " + sdf.format(end) + ", elapsed: "
            + TimingUtil.elapsedTime(start, end));
    return 0;
}