Example usage for org.apache.hadoop.mapred JobConf setBoolean

Introduction

On this page you can find usage examples for org.apache.hadoop.mapred.JobConf.setBoolean.

Prototype

public void setBoolean(String name, boolean value) 

Document

Set the value of the name property to a boolean.
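A minimal sketch of the call (the property key here is made up for illustration): setBoolean stores a boolean under the given property name, and the symmetric Configuration.getBoolean(String, boolean) reads it back, returning the supplied default when the key is absent.

JobConf conf = new JobConf();
// Store a flag under a hypothetical property key.
conf.setBoolean("example.feature.enabled", true);
// Read it back; the second argument is the default used when the key is unset.
boolean enabled = conf.getBoolean("example.feature.enabled", false);

Since JobConf extends org.apache.hadoop.conf.Configuration, the same pair of calls works on any Configuration instance, as several of the examples below also show via job.getConfiguration().setBoolean(...).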

Usage

From source file: org.apache.crunch.io.orc.OrcFileReaderFactory.java

License: Apache License

@Override
public Iterator<T> read(FileSystem fs, final Path path) {
    try {
        if (!fs.isFile(path)) {
            throw new CrunchRuntimeException("Not a file: " + path);
        }

        inputFn.initialize();

        FileStatus status = fs.getFileStatus(path);
        FileSplit split = new FileSplit(path, 0, status.getLen(), new String[0]);

        JobConf conf = new JobConf();
        if (readColumns != null) {
            conf.setBoolean(OrcFileSource.HIVE_READ_ALL_COLUMNS, false);
            conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR,
                    OrcFileSource.getColumnIdsStr(readColumns));
        }
        final RecordReader<NullWritable, OrcStruct> reader = inputFormat.getRecordReader(split, conf,
                Reporter.NULL);

        return new UnmodifiableIterator<T>() {

            private boolean checked = false;
            private boolean hasNext;
            private OrcStruct value;
            private OrcWritable writable = new OrcWritable();

            @Override
            public boolean hasNext() {
                try {
                    if (value == null) {
                        value = reader.createValue();
                    }
                    if (!checked) {
                        hasNext = reader.next(NullWritable.get(), value);
                        checked = true;
                    }
                    return hasNext;
                } catch (Exception e) {
                    throw new CrunchRuntimeException("Error while reading local file: " + path, e);
                }
            }

            @Override
            public T next() {
                try {
                    if (value == null) {
                        value = reader.createValue();
                    }
                    if (!checked) {
                        reader.next(NullWritable.get(), value);
                    }
                    checked = false;
                    writable.set(value);
                    return inputFn.map(writable);
                } catch (Exception e) {
                    throw new CrunchRuntimeException("Error while reading local file: " + path, e);
                }
            }

        };
    } catch (Exception e) {
        throw new CrunchRuntimeException("Error while reading local file: " + path, e);
    }
}

From source file: org.apache.hive.streaming.TestStreaming.java

License: Apache License

/**
 * @deprecated use {@link #checkDataWritten2(Path, long, long, int, String, boolean, String...)} -
 * there is little value in using InputFormat directly
 */
@Deprecated
private void checkDataWritten(Path partitionPath, long minTxn, long maxTxn, int buckets, int numExpectedFiles,
        String... records) throws Exception {
    ValidWriteIdList writeIds = getTransactionContext(conf);
    AcidUtils.Directory dir = AcidUtils.getAcidState(partitionPath, conf, writeIds);
    Assert.assertEquals(0, dir.getObsolete().size());
    Assert.assertEquals(0, dir.getOriginalFiles().size());
    List<AcidUtils.ParsedDelta> current = dir.getCurrentDirectories();
    System.out.println("Files found: ");
    for (AcidUtils.ParsedDelta pd : current) {
        System.out.println(pd.getPath().toString());
    }
    Assert.assertEquals(numExpectedFiles, current.size());

    // find the minimum and maximum write IDs across the current deltas
    long min = Long.MAX_VALUE;
    long max = Long.MIN_VALUE;
    for (AcidUtils.ParsedDelta pd : current) {
        if (pd.getMaxWriteId() > max) {
            max = pd.getMaxWriteId();
        }
        if (pd.getMinWriteId() < min) {
            min = pd.getMinWriteId();
        }
    }
    Assert.assertEquals(minTxn, min);
    Assert.assertEquals(maxTxn, max);

    InputFormat inf = new OrcInputFormat();
    JobConf job = new JobConf();
    job.set("mapred.input.dir", partitionPath.toString());
    job.set(BUCKET_COUNT, Integer.toString(buckets));
    job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, "id,msg");
    job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, "bigint:string");
    AcidUtils.setAcidOperationalProperties(job, true, null);
    job.setBoolean(hive_metastoreConstants.TABLE_IS_TRANSACTIONAL, true);
    job.set(ValidWriteIdList.VALID_WRITEIDS_KEY, writeIds.writeToString());
    job.set(ValidTxnList.VALID_TXNS_KEY, conf.get(ValidTxnList.VALID_TXNS_KEY));
    InputSplit[] splits = inf.getSplits(job, buckets);
    Assert.assertEquals(numExpectedFiles, splits.length);
    org.apache.hadoop.mapred.RecordReader<NullWritable, OrcStruct> rr = inf.getRecordReader(splits[0], job,
            Reporter.NULL);

    NullWritable key = rr.createKey();
    OrcStruct value = rr.createValue();
    for (String record : records) {
        Assert.assertEquals(true, rr.next(key, value));
        Assert.assertEquals(record, value.toString());
    }
    Assert.assertEquals(false, rr.next(key, value));
}

From source file: org.apache.ignite.internal.processors.hadoop.impl.HadoopTeraSortTest.java

License: Apache License

/**
 * Runs the actual TeraSort test job through the Ignite API.
 *
 * @param gzip Whether to use GZIP.
 */
protected final void teraSort(boolean gzip) throws Exception {
    System.out.println("TeraSort ===============================================================");

    getFileSystem().delete(new Path(sortOutDir), true);

    final JobConf jobConf = new JobConf();

    jobConf.setUser(getUser());

    jobConf.set("fs.defaultFS", getFsBase());

    log().info("Desired number of reduces: " + numReduces());

    jobConf.set("mapreduce.job.reduces", String.valueOf(numReduces()));

    log().info("Desired number of maps: " + numMaps());

    final long splitSize = dataSizeBytes() / numMaps();

    log().info("Desired split size: " + splitSize);

    // Force the split to be of the desired size:
    jobConf.set("mapred.min.split.size", String.valueOf(splitSize));
    jobConf.set("mapred.max.split.size", String.valueOf(splitSize));

    jobConf.setBoolean(HadoopJobProperty.SHUFFLE_MAPPER_STRIPED_OUTPUT.propertyName(), true);
    jobConf.setInt(HadoopJobProperty.SHUFFLE_MSG_SIZE.propertyName(), 4096);

    if (gzip)
        jobConf.setBoolean(HadoopJobProperty.SHUFFLE_MSG_GZIP.propertyName(), true);

    jobConf.set(HadoopJobProperty.JOB_PARTIALLY_RAW_COMPARATOR.propertyName(),
            TextPartiallyRawComparator.class.getName());

    Job job = setupConfig(jobConf);

    HadoopJobId jobId = new HadoopJobId(UUID.randomUUID(), 1);

    IgniteInternalFuture<?> fut = grid(0).hadoop().submit(jobId, createJobInfo(job.getConfiguration()));

    fut.get();
}

From source file: org.apache.mahout.avro.text.mapred.WikipediaToAvroDocuments.java

License: Apache License

/**
 * Run the job.
 * 
 * @param input
 *          the input pathname String
 * @param output
 *          the output pathname String
 * @param catFile
 *          the file containing the Wikipedia categories
 * @param exactMatchOnly
 *          if true, then the Wikipedia category must match exactly instead of
 *          simply containing the category string
 * @param all
 *          if true select all categories
 */
public static int runJob(String input, String output, String catFile, boolean exactMatchOnly, boolean all)
        throws IOException {
    JobClient client = new JobClient();
    JobConf conf = new JobConf(WikipediaToAvroDocuments.class);
    if (log.isInfoEnabled()) {
        log.info("Input: " + input + " Out: " + output + " Categories: " + catFile + " All Files: " + all);
    }

    Path inPath = new Path(input);
    Path outPath = new Path(output);

    FileInputFormat.setInputPaths(conf, inPath);
    FileOutputFormat.setOutputPath(conf, outPath);
    //AvroOutputFormat.setClass(conf, AvroDocument.class);
    //AvroOutputFormat.setSchema(conf, AvroDocument._SCHEMA);

    conf.set("xmlinput.start", "<page>");
    conf.set("xmlinput.end", "</page>");
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(AvroDocument.class);
    conf.setBoolean("exact.match.only", exactMatchOnly);
    conf.setBoolean("all.files", all);
    conf.setMapperClass(WikipediaAvroDocumentMapper.class);
    conf.setInputFormat(XmlInputFormat.class);
    conf.setReducerClass(IdentityReducer.class);
    conf.setOutputFormat(AvroOutputFormat.class);

    AvroOutputFormat.setAvroOutputClass(conf, AvroDocument.class);

    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    if (dfs.exists(outPath)) {
        dfs.delete(outPath, true);
    }

    Set<String> categories = new HashSet<String>();
    if (catFile.equals("") == false) {
        for (String line : new FileLineIterable(new File(catFile))) {
            categories.add(line.trim().toLowerCase());
        }
    }

    DefaultStringifier<Set<String>> setStringifier = new DefaultStringifier<Set<String>>(conf,
            GenericsUtil.getClass(categories));

    String categoriesStr = setStringifier.toString(categories);

    conf.set("wikipedia.categories", categoriesStr);

    client.setConf(conf);
    RunningJob job = JobClient.runJob(conf);
    job.waitForCompletion();
    return job.isSuccessful() ? 1 : 0;
}

From source file: org.apache.mahout.math.hadoop.stochasticsvd.BtJob.java

License: Apache License

public static void run(Configuration conf, Path[] inputPathA, Path inputPathQJob, Path xiPath, Path outputPath,
        int minSplitSize, int k, int p, int btBlockHeight, int numReduceTasks, boolean broadcast,
        Class<? extends Writable> labelClass, boolean outputBBtProducts)
        throws ClassNotFoundException, InterruptedException, IOException {

    JobConf oldApiJob = new JobConf(conf);

    MultipleOutputs.addNamedOutput(oldApiJob, OUTPUT_Q, org.apache.hadoop.mapred.SequenceFileOutputFormat.class,
            labelClass, VectorWritable.class);

    if (outputBBtProducts) {
        MultipleOutputs.addNamedOutput(oldApiJob, OUTPUT_BBT,
                org.apache.hadoop.mapred.SequenceFileOutputFormat.class, IntWritable.class,
                VectorWritable.class);
        /*
         * MAHOUT-1067: if we are asked to output BBT products then named vector
         * names should be propagated to Q too so that UJob could pick them up
         * from there.
         */
        oldApiJob.setBoolean(PROP_NV, true);
    }
    if (xiPath != null) {
        // compute PCA-related outputs as well
        MultipleOutputs.addNamedOutput(oldApiJob, OUTPUT_SQ,
                org.apache.hadoop.mapred.SequenceFileOutputFormat.class, IntWritable.class,
                VectorWritable.class);
        MultipleOutputs.addNamedOutput(oldApiJob, OUTPUT_SB,
                org.apache.hadoop.mapred.SequenceFileOutputFormat.class, IntWritable.class,
                VectorWritable.class);
    }

    /*
     * HACK: we use old api multiple outputs since they are not available in the
     * new api of either 0.20.2 or 0.20.203 but wrap it into a new api job so we
     * can use new api interfaces.
     */

    Job job = new Job(oldApiJob);
    job.setJobName("Bt-job");
    job.setJarByClass(BtJob.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileInputFormat.setInputPaths(job, inputPathA);
    if (minSplitSize > 0) {
        FileInputFormat.setMinInputSplitSize(job, minSplitSize);
    }
    FileOutputFormat.setOutputPath(job, outputPath);

    // WARN: tight hadoop integration here:
    job.getConfiguration().set("mapreduce.output.basename", OUTPUT_BT);

    FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(SparseRowBlockWritable.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(BtMapper.class);
    job.setCombinerClass(OuterProductCombiner.class);
    job.setReducerClass(OuterProductReducer.class);

    job.getConfiguration().setInt(QJob.PROP_K, k);
    job.getConfiguration().setInt(QJob.PROP_P, p);
    job.getConfiguration().set(PROP_QJOB_PATH, inputPathQJob.toString());
    job.getConfiguration().setBoolean(PROP_OUPTUT_BBT_PRODUCTS, outputBBtProducts);
    job.getConfiguration().setInt(PROP_OUTER_PROD_BLOCK_HEIGHT, btBlockHeight);

    job.setNumReduceTasks(numReduceTasks);

    /*
     * PCA-related options, MAHOUT-817
     */
    if (xiPath != null) {
        job.getConfiguration().set(PROP_XI_PATH, xiPath.toString());
    }

    /*
     * We can broadcast the Rhat files since all of them are required by each
     * job, but not the Q files, which correspond to splits of A (each split of
     * A requires only one particular Q file, a different one each time).
     */

    if (broadcast) {
        job.getConfiguration().set(PROP_RHAT_BROADCAST, "y");

        FileSystem fs = FileSystem.get(inputPathQJob.toUri(), conf);
        FileStatus[] fstats = fs.globStatus(new Path(inputPathQJob, QJob.OUTPUT_RHAT + "-*"));
        if (fstats != null) {
            for (FileStatus fstat : fstats) {
                /*
                 * new api is not enabled yet in our dependencies at this time, still
                 * using deprecated one
                 */
                DistributedCache.addCacheFile(fstat.getPath().toUri(), job.getConfiguration());
            }
        }
    }

    job.submit();
    job.waitForCompletion(false);

    if (!job.isSuccessful()) {
        throw new IOException("Bt job unsuccessful.");
    }
}

From source file: org.apache.nutch.crawl.CrawlDb.java

License: Apache License

public void update(Path crawlDb, Path[] segments, boolean normalize, boolean filter, boolean additionsAllowed,
        boolean force) throws IOException {
    FileSystem fs = FileSystem.get(getConf());
    Path lock = new Path(crawlDb, LOCK_NAME);
    LockUtil.createLockFile(fs, lock, force);
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();

    JobConf job = CrawlDb.createJob(getConf(), crawlDb);
    job.setBoolean(CRAWLDB_ADDITIONS_ALLOWED, additionsAllowed);
    job.setBoolean(CrawlDbFilter.URL_FILTERING, filter);
    job.setBoolean(CrawlDbFilter.URL_NORMALIZING, normalize);

    boolean url404Purging = job.getBoolean(CRAWLDB_PURGE_404, false);

    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb update: starting at " + sdf.format(start));
        LOG.info("CrawlDb update: db: " + crawlDb);
        LOG.info("CrawlDb update: segments: " + Arrays.asList(segments));
        LOG.info("CrawlDb update: additions allowed: " + additionsAllowed);
        LOG.info("CrawlDb update: URL normalizing: " + normalize);
        LOG.info("CrawlDb update: URL filtering: " + filter);
        LOG.info("CrawlDb update: 404 purging: " + url404Purging);
    }

    for (int i = 0; i < segments.length; i++) {
        Path fetch = new Path(segments[i], CrawlDatum.FETCH_DIR_NAME);
        Path parse = new Path(segments[i], CrawlDatum.PARSE_DIR_NAME);
        if (fs.exists(fetch) && fs.exists(parse)) {
            FileInputFormat.addInputPath(job, fetch);
            FileInputFormat.addInputPath(job, parse);
        } else {
            LOG.info(" - skipping invalid segment " + segments[i]);
        }
    }

    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb update: Merging segment data into db.");
    }
    try {
        JobClient.runJob(job);
    } catch (IOException e) {
        LockUtil.removeLockFile(fs, lock);
        Path outPath = FileOutputFormat.getOutputPath(job);
        if (fs.exists(outPath))
            fs.delete(outPath, true);
        throw e;
    }

    CrawlDb.install(job, crawlDb);
    long end = System.currentTimeMillis();
    LOG.info("CrawlDb update: finished at " + sdf.format(end) + ", elapsed: "
            + TimingUtil.elapsedTime(start, end));
}

From source file: org.apache.nutch.crawl.CrawlDbMerger.java

License: Apache License

public static JobConf createMergeJob(Configuration conf, Path output, boolean normalize, boolean filter) {
    Path newCrawlDb = new Path("crawldb-merge-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf job = new NutchJob(conf);
    job.setJobName("crawldb merge " + output);

    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(CrawlDbFilter.class);
    job.setBoolean(CrawlDbFilter.URL_FILTERING, filter);
    job.setBoolean(CrawlDbFilter.URL_NORMALIZING, normalize);
    job.setReducerClass(Merger.class);

    FileOutputFormat.setOutputPath(job, newCrawlDb);
    job.setOutputFormat(MapFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);

    return job;
}

From source file: org.apache.nutch.crawl.CrawlDbReader.java

License: Apache License

public void processStatJob(String crawlDb, Configuration config, boolean sort) throws IOException {

    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb statistics start: " + crawlDb);
    }

    Path tmpFolder = new Path(crawlDb, "stat_tmp" + System.currentTimeMillis());

    JobConf job = new NutchJob(config);
    job.setJobName("stats " + crawlDb);
    job.setBoolean("db.reader.stats.sort", sort);

    FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(CrawlDbStatMapper.class);
    job.setCombinerClass(CrawlDbStatCombiner.class);
    job.setReducerClass(CrawlDbStatReducer.class);

    FileOutputFormat.setOutputPath(job, tmpFolder);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    // https://issues.apache.org/jira/browse/NUTCH-1029
    job.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    JobClient.runJob(job);

    // reading the result
    FileSystem fileSystem = FileSystem.get(config);
    SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(config, tmpFolder);

    Text key = new Text();
    LongWritable value = new LongWritable();

    TreeMap<String, LongWritable> stats = new TreeMap<String, LongWritable>();
    for (int i = 0; i < readers.length; i++) {
        SequenceFile.Reader reader = readers[i];
        while (reader.next(key, value)) {
            String k = key.toString();
            LongWritable val = stats.get(k);
            if (val == null) {
                val = new LongWritable();
                if (k.equals("scx"))
                    val.set(Long.MIN_VALUE);
                if (k.equals("scn"))
                    val.set(Long.MAX_VALUE);
                stats.put(k, val);
            }
            if (k.equals("scx")) {
                if (val.get() < value.get())
                    val.set(value.get());
            } else if (k.equals("scn")) {
                if (val.get() > value.get())
                    val.set(value.get());
            } else {
                val.set(val.get() + value.get());
            }
        }
        reader.close();
    }

    if (LOG.isInfoEnabled()) {
        LOG.info("Statistics for CrawlDb: " + crawlDb);
        LongWritable totalCnt = stats.get("T");
        stats.remove("T");
        LOG.info("TOTAL urls:\t" + totalCnt.get());
        for (Map.Entry<String, LongWritable> entry : stats.entrySet()) {
            String k = entry.getKey();
            LongWritable val = entry.getValue();
            if (k.equals("scn")) {
                LOG.info("min score:\t" + (float) (val.get() / 1000.0f));
            } else if (k.equals("scx")) {
                LOG.info("max score:\t" + (float) (val.get() / 1000.0f));
            } else if (k.equals("sct")) {
                LOG.info("avg score:\t" + (float) ((((double) val.get()) / totalCnt.get()) / 1000.0));
            } else if (k.startsWith("status")) {
                String[] st = k.split(" ");
                int code = Integer.parseInt(st[1]);
                if (st.length > 2)
                    LOG.info("   " + st[2] + " :\t" + val);
                else
                    LOG.info(st[0] + " " + code + " (" + CrawlDatum.getStatusName((byte) code) + "):\t" + val);
            } else
                LOG.info(k + ":\t" + val);
        }
    }
    // removing the tmp folder
    fileSystem.delete(tmpFolder, true);
    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb statistics: done");
    }

}

From source file: org.apache.nutch.crawl.Generator.java

License: Apache License

/**
 * Generate fetchlists in one or more segments. Whether to filter URLs or not
 * is read from the crawl.generate.filter property in the configuration files.
 * If the property is not found, the URLs are filtered. Same for the
 * normalisation.
 * 
 * @param dbDir
 *          Crawl database directory
 * @param segments
 *          Segments directory
 * @param numLists
 *          Number of reduce tasks
 * @param topN
 *          Number of top URLs to be selected
 * @param curTime
 *          Current time in milliseconds
 * 
 * @return Path to generated segment or null if no entries were selected
 * 
 * @throws IOException
 *           When an I/O error occurs
 */
public Path[] generate(Path dbDir, Path segments, int numLists, long topN, long curTime, boolean filter,
        boolean norm, boolean force, int maxNumSegments) throws IOException {

    Path tempDir = new Path(
            getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

    Path lock = new Path(dbDir, CrawlDb.LOCK_NAME);
    FileSystem fs = FileSystem.get(getConf());
    LockUtil.createLockFile(fs, lock, force);

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("Generator: starting at " + sdf.format(start));
    LOG.info("Generator: Selecting best-scoring urls due for fetch.");
    LOG.info("Generator: filtering: " + filter);
    LOG.info("Generator: normalizing: " + norm);
    if (topN != Long.MAX_VALUE) {
        LOG.info("Generator: topN: " + topN);
    }

    if ("true".equals(getConf().get(GENERATE_MAX_PER_HOST_BY_IP))) {
        LOG.info("Generator: GENERATE_MAX_PER_HOST_BY_IP will be ignored, use partition.url.mode instead");
    }

    // map to inverted subset due for fetch, sort by score
    JobConf job = new NutchJob(getConf());
    job.setJobName("generate: select from " + dbDir);

    if (numLists == -1) { // for politeness make
        numLists = job.getNumMapTasks(); // a partition per fetch task
    }
    if ("local".equals(job.get("mapred.job.tracker")) && numLists != 1) {
        // override
        LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
        numLists = 1;
    }
    job.setLong(GENERATOR_CUR_TIME, curTime);
    // record real generation time
    long generateTime = System.currentTimeMillis();
    job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
    job.setLong(GENERATOR_TOP_N, topN);
    job.setBoolean(GENERATOR_FILTER, filter);
    job.setBoolean(GENERATOR_NORMALISE, norm);
    job.setInt(GENERATOR_MAX_NUM_SEGMENTS, maxNumSegments);

    FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(Selector.class);
    job.setPartitionerClass(Selector.class);
    job.setReducerClass(Selector.class);

    FileOutputFormat.setOutputPath(job, tempDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(FloatWritable.class);
    job.setOutputKeyComparatorClass(DecreasingFloatComparator.class);
    job.setOutputValueClass(SelectorEntry.class);
    job.setOutputFormat(GeneratorOutputFormat.class);

    JobClient.runJob(job);

    // read the subdirectories generated in the temp
    // output and turn them into segments
    List<Path> generatedSegments = new ArrayList<Path>();

    FileStatus[] status = fs.listStatus(tempDir);
    try {
        for (FileStatus stat : status) {
            Path subfetchlist = stat.getPath();
            if (!subfetchlist.getName().startsWith("fetchlist-"))
                continue;
            // start a new partition job for this segment
            Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists);
            generatedSegments.add(newSeg);
        }
    } catch (Exception e) {
        LOG.warn("Generator: exception while partitioning segments, exiting ...");
        fs.delete(tempDir, true);
        return null;
    }

    if (generatedSegments.size() == 0) {
        LOG.warn("Generator: 0 records selected for fetching, exiting ...");
        LockUtil.removeLockFile(fs, lock);
        fs.delete(tempDir, true);
        return null;
    }

    if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
        // update the db from tempDir
        Path tempDir2 = new Path(
                getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

        job = new NutchJob(getConf());
        job.setJobName("generate: updatedb " + dbDir);
        job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
        for (Path segmpaths : generatedSegments) {
            Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME);
            FileInputFormat.addInputPath(job, subGenDir);
        }
        FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setMapperClass(CrawlDbUpdater.class);
        job.setReducerClass(CrawlDbUpdater.class);
        job.setOutputFormat(MapFileOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(CrawlDatum.class);
        FileOutputFormat.setOutputPath(job, tempDir2);
        try {
            JobClient.runJob(job);
            CrawlDb.install(job, dbDir);
        } catch (IOException e) {
            LockUtil.removeLockFile(fs, lock);
            fs.delete(tempDir, true);
            fs.delete(tempDir2, true);
            throw e;
        }
        fs.delete(tempDir2, true);
    }

    LockUtil.removeLockFile(fs, lock);
    fs.delete(tempDir, true);

    long end = System.currentTimeMillis();
    LOG.info("Generator: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));

    Path[] patharray = new Path[generatedSegments.size()];
    return generatedSegments.toArray(patharray);
}

From source file: org.apache.nutch.crawl.LinkDb.java

License: Apache License

private static JobConf createJob(Configuration config, Path linkDb, boolean normalize, boolean filter) {
    Path newLinkDb = new Path("linkdb-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf job = new NutchJob(config);
    job.setJobName("linkdb " + linkDb);

    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(LinkDb.class);
    job.setCombinerClass(LinkDbMerger.class);
    // if we don't run the mergeJob, perform normalization/filtering now
    if (normalize || filter) {
        try {
            FileSystem fs = FileSystem.get(config);
            if (!fs.exists(linkDb)) {
                job.setBoolean(LinkDbFilter.URL_FILTERING, filter);
                job.setBoolean(LinkDbFilter.URL_NORMALIZING, normalize);
            }
        } catch (Exception e) {
            LOG.warn("LinkDb createJob: " + e);
        }
    }
    job.setReducerClass(LinkDbMerger.class);

    FileOutputFormat.setOutputPath(job, newLinkDb);
    job.setOutputFormat(MapFileOutputFormat.class);
    job.setBoolean("mapred.output.compress", true);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Inlinks.class);

    return job;
}