Example usage for org.apache.hadoop.mapred JobConf setBoolean


Introduction

On this page you can find example usages of org.apache.hadoop.mapred.JobConf#setBoolean.

Prototype

public void setBoolean(String name, boolean value) 

Document

Set the value of the name property to a boolean.
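
As a minimal, self-contained sketch of the pattern used in the examples below: the job driver sets a custom boolean flag with setBoolean, and tasks read it back with getBoolean and a default value. The class name SetBooleanExample and the property key "example.skip.empty.records" are illustrative only and do not come from the examples on this page.

import org.apache.hadoop.mapred.JobConf;

public class SetBooleanExample {
    public static void main(String[] args) {
        // The property key below is hypothetical; use your own application-specific key.
        JobConf conf = new JobConf(SetBooleanExample.class);
        conf.setBoolean("example.skip.empty.records", true);

        // Tasks typically read the flag back with a default,
        // e.g. in Mapper#configure(JobConf) or Reducer#configure(JobConf).
        boolean skipEmpty = conf.getBoolean("example.skip.empty.records", false);
        System.out.println("example.skip.empty.records = " + skipEmpty);
    }
}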

Usage

From source file:ivory.core.preprocess.BuildWeightedTermDocVectors.java

License:Apache License

@SuppressWarnings("deprecation")
public int runTool() throws Exception {
    sLogger.info("PowerTool: GetWeightedTermDocVectors");

    JobConf conf = new JobConf(BuildWeightedTermDocVectors.class);
    FileSystem fs = FileSystem.get(conf);

    String indexPath = getConf().get("Ivory.IndexPath");
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    String outputPath = env.getWeightedTermDocVectorsDirectory();
    int mapTasks = getConf().getInt("Ivory.NumMapTasks", 0);
    int minSplitSize = getConf().getInt("Ivory.MinSplitSize", 0);
    String collectionName = getConf().get("Ivory.CollectionName");

    String termsFilePath = env.getIndexTermsData();
    String termsIdsFilePath = env.getIndexTermIdsData();
    String termIdMappingFilePath = env.getIndexTermIdMappingData();
    String dfByTermFilePath = env.getDfByTermData();

    Path inputPath = new Path(env.getTermDocVectorsDirectory());
    Path weightedVectorsPath = new Path(outputPath);

    if (fs.exists(weightedVectorsPath)) {
        //fs.delete(weightedVectorsPath, true);
        sLogger.info("Output path already exists!");
        return 0;
    }

    /* add terms file to cache */
    if (!fs.exists(new Path(termsFilePath)) || !fs.exists(new Path(termsIdsFilePath))
            || !fs.exists(new Path(termIdMappingFilePath))) {
        throw new RuntimeException("Error, terms file " + termsFilePath + "/" + termsIdsFilePath + "/"
                + termIdMappingFilePath + "doesn't exist!");
    }
    DistributedCache.addCacheFile(new URI(termsFilePath), conf);
    DistributedCache.addCacheFile(new URI(termsIdsFilePath), conf);
    DistributedCache.addCacheFile(new URI(termIdMappingFilePath), conf);

    /* add df table to cache */
    if (!fs.exists(new Path(dfByTermFilePath))) {
        throw new RuntimeException("Error, df data file " + dfByTermFilePath + "doesn't exist!");
    }
    DistributedCache.addCacheFile(new URI(dfByTermFilePath), conf);

    /* add dl table to cache */
    Path docLengthFile = env.getDoclengthsData();
    if (!fs.exists(docLengthFile)) {
        throw new RuntimeException("Error, doc-length data file " + docLengthFile + "doesn't exist!");
    }
    DistributedCache.addCacheFile(docLengthFile.toUri(), conf);

    conf.setMapperClass(MyMapper.class);
    //conf.setInt("mapred.task.timeout",3600000);
    conf.setJobName("GetWeightedTermDocVectors:" + collectionName);
    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(0);
    conf.setInt("mapred.min.split.size", minSplitSize);
    conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.setInt("Ivory.MinNumTerms", getConf().getInt("Ivory.MinNumTerms", Integer.MAX_VALUE));
    conf.setBoolean("Ivory.Normalize", getConf().getBoolean("Ivory.Normalize", false));
    if (getConf().get("Ivory.ShortDocLengths") != null) {
        conf.set("Ivory.ShortDocLengths", getConf().get("Ivory.ShortDocLengths"));
    }
    conf.set("Ivory.ScoringModel", getConf().get("Ivory.ScoringModel"));

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, weightedVectorsPath);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(HMapSFW.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(HMapSFW.class);

    sLogger.info("Running job: " + conf.getJobName());

    long startTime = System.currentTimeMillis();
    RunningJob job = JobClient.runJob(conf);
    sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}

From source file:net.darkseraphim.webanalytics.hadoop.csv.CSVTextInputFormat.java

License:Apache License

@Override
public RecordReader<LongWritable, List<Text>> getRecordReader(InputSplit split, JobConf conf, Reporter reporter)
        throws IOException {
    String quote = conf.get(CSVLineRecordReader.FORMAT_DELIMITER);
    String separator = conf.get(CSVLineRecordReader.FORMAT_SEPARATOR);
    conf.set(CSVLineRecordReader.FORMAT_DELIMITER, CSVLineRecordReader.DEFAULT_DELIMITER);
    conf.set(CSVLineRecordReader.FORMAT_SEPARATOR, CSVLineRecordReader.DEFAULT_SEPARATOR);
    conf.setBoolean(CSVLineRecordReader.IS_ZIPFILE, false);
    System.out.println("[LOG] Created reader");
    if (split instanceof FileSplit) {
        return reader = new CSVLineRecordReader(split, conf);
    }
    throw new UnsupportedOperationException("Only FileSplits are supported");
}

From source file:net.peacesoft.nutch.crawl.RaovatIndexer.java

License:Apache License

public void indexSolr(String solrUrl, Path inputDb, Path crawlDb, Path linkDb, List<Path> segments,
        boolean noCommit, boolean deleteGone, String solrParams, boolean filter, boolean normalize)
        throws IOException {

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("RaovatIndexer: starting at " + sdf.format(start));

    final JobConf job = new NutchJob(getConf());
    job.setJobName("index-solr " + solrUrl);

    LOG.info("RaovatIndexer: deleting gone documents: " + deleteGone);
    LOG.info("RaovatIndexer: URL filtering: " + filter);
    LOG.info("RaovatIndexer: URL normalizing: " + normalize);

    ReIndexerMapReduce.initMRJob(inputDb, crawlDb, linkDb, segments, job);

    job.set(ReSolrConstants.SERVER_URL, solrUrl);
    job.setBoolean(IndexerMapReduce.INDEXER_DELETE, deleteGone);
    job.setBoolean(IndexerMapReduce.URL_FILTERING, filter);
    job.setBoolean(IndexerMapReduce.URL_NORMALIZING, normalize);
    if (solrParams != null) {
        job.set(ReSolrConstants.PARAMS, solrParams);
    }
    NutchIndexWriterFactory.addClassToConf(job, RaovatPoster.class);

    job.setReduceSpeculativeExecution(false);

    final Path tmp = new Path("tmp_" + System.currentTimeMillis() + "-" + new Random().nextInt());

    FileOutputFormat.setOutputPath(job, tmp);
    try {
        JobClient.runJob(job);
        // do the commits once and for all the reducers in one go
        //            SolrServer solr = SolrUtils.getCommonsHttpSolrServer(job);
        //
        //            if (!noCommit) {
        //                solr.commit();
        //            }
        long end = System.currentTimeMillis();
        LOG.info("RaovatIndexer: finished at " + sdf.format(end) + ", elapsed: "
                + TimingUtil.elapsedTime(start, end));
    } catch (Exception e) {
        LOG.error("RaovatIndexer:" + e.toString());
    } finally {
        FileSystem.get(job).delete(tmp, true);
    }
}

From source file:net.peacesoft.nutch.crawl.RaovatIndexer.java

License:Apache License

public void indexSolr(String solrUrl, Path crawlDb, Path linkDb, List<Path> segments, boolean noCommit,
        boolean deleteGone, String solrParams, boolean filter, boolean normalize) throws IOException {

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("RaovatIndexer: starting at " + sdf.format(start));

    final JobConf job = new NutchJob(getConf());
    job.setJobName("index-solr " + solrUrl);

    LOG.info("RaovatIndexer: deleting gone documents: " + deleteGone);
    LOG.info("RaovatIndexer: URL filtering: " + filter);
    LOG.info("RaovatIndexer: URL normalizing: " + normalize);

    IndexerMapReduce.initMRJob(crawlDb, linkDb, segments, job);

    job.set(ReSolrConstants.SERVER_URL, solrUrl);
    job.setBoolean(IndexerMapReduce.INDEXER_DELETE, deleteGone);
    job.setBoolean(IndexerMapReduce.URL_FILTERING, filter);
    job.setBoolean(IndexerMapReduce.URL_NORMALIZING, normalize);
    if (solrParams != null) {
        job.set(ReSolrConstants.PARAMS, solrParams);
    }
    NutchIndexWriterFactory.addClassToConf(job, RaovatPoster.class);

    job.setReduceSpeculativeExecution(false);

    final Path tmp = new Path("tmp_" + System.currentTimeMillis() + "-" + new Random().nextInt());

    FileOutputFormat.setOutputPath(job, tmp);
    try {
        JobClient.runJob(job);
        // do the commits once and for all the reducers in one go
        //            SolrServer solr = SolrUtils.getCommonsHttpSolrServer(job);
        //
        //            if (!noCommit) {
        //                solr.commit();
        //            }
        long end = System.currentTimeMillis();
        LOG.info("RaovatIndexer: finished at " + sdf.format(end) + ", elapsed: "
                + TimingUtil.elapsedTime(start, end));
    } catch (Exception e) {
        LOG.error("RaovatIndexer: " + e.toString());
    } finally {
        FileSystem.get(job).delete(tmp, true);
    }
}

From source file:net.peacesoft.nutch.crawl.RaovatPostDeleteDuplicates.java

License:Apache License

public void dedup(String solrUrl, boolean noCommit) throws IOException {
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("RaovatPostDeleteDuplicates: starting at " + sdf.format(start));
    LOG.info("RaovatPostDeleteDuplicates: Solr url: " + solrUrl);

    JobConf job = new NutchJob(getConf());

    job.set(ReSolrConstants.SERVER_URL, solrUrl);
    job.setBoolean("noCommit", noCommit);
    job.setInputFormat(RaovatPostDeleteDuplicates.SolrInputFormat.class);
    job.setOutputFormat(NullOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(RaovatPostDeleteDuplicates.SolrRecord.class);
    job.setMapperClass(IdentityMapper.class);
    job.setReducerClass(RaovatPostDeleteDuplicates.class);

    JobClient.runJob(job);

    long end = System.currentTimeMillis();
    LOG.info("RaovatPostDeleteDuplicates: finished at " + sdf.format(end) + ", elapsed: "
            + TimingUtil.elapsedTime(start, end));
}

From source file:net.peacesoft.nutch.crawl.ReCrawlDb.java

License:Apache License

public void update(Path crawlDb, Path[] segments, boolean normalize, boolean filter, boolean additionsAllowed,
        boolean force) throws IOException {
    try {
        FileSystem fs = FileSystem.get(getConf());
        Path lock = new Path(crawlDb, LOCK_NAME);
        try {
            LockUtil.createLockFile(fs, lock, force);
        } catch (Exception ex) {
        }
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        long start = System.currentTimeMillis();

        JobConf job = ReCrawlDb.createJob(getConf(), crawlDb);
        job.setBoolean(CRAWLDB_ADDITIONS_ALLOWED, additionsAllowed);
        job.setBoolean(CrawlDbFilter.URL_FILTERING, filter);
        job.setBoolean(CrawlDbFilter.URL_NORMALIZING, normalize);

        boolean url404Purging = job.getBoolean(CRAWLDB_PURGE_404, false);

        if (LOG.isInfoEnabled()) {
            LOG.info("CrawlDb update: starting at " + sdf.format(start));
            LOG.info("CrawlDb update: db: " + crawlDb);
            LOG.info("CrawlDb update: segments: " + Arrays.asList(segments));
            LOG.info("CrawlDb update: additions allowed: " + additionsAllowed);
            LOG.info("CrawlDb update: URL normalizing: " + normalize);
            LOG.info("CrawlDb update: URL filtering: " + filter);
            LOG.info("CrawlDb update: 404 purging: " + url404Purging);
        }

        for (int i = 0; i < segments.length; i++) {
            Path fetch = new Path(segments[i], CrawlDatum.FETCH_DIR_NAME);
            Path parse = new Path(segments[i], CrawlDatum.PARSE_DIR_NAME);
            if (fs.exists(fetch) && fs.exists(parse)) {
                FileInputFormat.addInputPath(job, fetch);
                FileInputFormat.addInputPath(job, parse);
            } else {
                LOG.info(" - skipping invalid segment " + segments[i]);
            }
        }

        if (LOG.isInfoEnabled()) {
            LOG.info("ReCrawlDb update: Merging segment data into db.");
        }
        try {
            JobClient.runJob(job);
        } catch (IOException e) {
            LockUtil.removeLockFile(fs, lock);
            Path outPath = FileOutputFormat.getOutputPath(job);
            if (fs.exists(outPath)) {
                fs.delete(outPath, true);
            }
            throw e;
        }

        ReCrawlDb.install(job, crawlDb);
        long end = System.currentTimeMillis();
        LOG.info("CrawlDb update: finished at " + sdf.format(end) + ", elapsed: "
                + TimingUtil.elapsedTime(start, end));
    } catch (Exception ex) {
        LOG.error("ReCrawlDb update error: " + ex.toString(), ex);
    }
}

From source file:net.peacesoft.nutch.crawl.ReCrawlDb.java

License:Apache License

public static JobConf createJob(Configuration config, Path crawlDb) throws IOException {
    Path newCrawlDb = new Path(crawlDb, Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf job = new NutchJob(config);
    job.setJobName("crawldb " + crawlDb);

    Path current = new Path(crawlDb, CURRENT_NAME);
    if (FileSystem.get(job).exists(current)) {
        FileInputFormat.addInputPath(job, current);
    }
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(CrawlDbFilter.class);
    job.setReducerClass(CrawlDbReducer.class);

    FileOutputFormat.setOutputPath(job, newCrawlDb);
    job.setOutputFormat(MapFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);

    // https://issues.apache.org/jira/browse/NUTCH-1110
    job.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    return job;
}

From source file:net.peacesoft.nutch.crawl.ReGenerator.java

License:Apache License

/**
 * Generate fetchlists in one or more segments. Whether to filter URLs or
 * not is read from the crawl.generate.filter property in the configuration
 * files. If the property is not found, the URLs are filtered. Same for the
 * normalisation.
 *
 * @param dbDir Crawl database directory
 * @param segments Segments directory
 * @param numLists Number of reduce tasks
 * @param topN Number of top URLs to be selected
 * @param curTime Current time in milliseconds
 *
 * @return Path to generated segment or null if no entries were selected
 *
 * @throws IOException When an I/O error occurs
 */
public Path[] generate(Path dbDir, Path segments, int numLists, long topN, long curTime, boolean filter,
        boolean norm, boolean force, int maxNumSegments) throws IOException {
    try {
        Path tempDir = new Path(
                getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

        Path lock = new Path(dbDir, CrawlDb.LOCK_NAME);
        FileSystem fs = FileSystem.get(getConf());
        LockUtil.createLockFile(fs, lock, force);

        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        long start = System.currentTimeMillis();
        LOG.info("ReGenerator: starting at " + sdf.format(start));
        LOG.info("ReGenerator: Selecting best-scoring urls due for fetch.");
        LOG.info("ReGenerator: filtering: " + filter);
        LOG.info("ReGenerator: normalizing: " + norm);
        if (topN != Long.MAX_VALUE) {
            LOG.info("ReGenerator: topN: " + topN);
        }

        if ("true".equals(getConf().get(GENERATE_MAX_PER_HOST_BY_IP))) {
            LOG.info(
                    "ReGenerator: GENERATE_MAX_PER_HOST_BY_IP will be ignored, use partition.url.mode instead");
        }

        // map to inverted subset due for fetch, sort by score
        JobConf job = new NutchJob(getConf());
        job.setJobName("generate: select from " + dbDir);

        if (numLists == -1) { // for politeness make
            numLists = job.getNumMapTasks(); // a partition per fetch task
        }
        if ("local".equals(job.get("mapred.job.tracker")) && numLists != 1) {
            // override
            LOG.info("ReGenerator: jobtracker is 'local', generating exactly one partition.");
            numLists = 1;
        }
        job.setLong(GENERATOR_CUR_TIME, curTime);
        // record real generation time
        long generateTime = System.currentTimeMillis();
        job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
        job.setLong(GENERATOR_TOP_N, topN);
        job.setBoolean(GENERATOR_FILTER, filter);
        job.setBoolean(GENERATOR_NORMALISE, norm);
        job.setInt(GENERATOR_MAX_NUM_SEGMENTS, maxNumSegments);

        FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
        job.setInputFormat(SequenceFileInputFormat.class);

        job.setMapperClass(Selector.class);
        job.setPartitionerClass(Selector.class);
        job.setReducerClass(Selector.class);

        FileOutputFormat.setOutputPath(job, tempDir);
        job.setOutputFormat(SequenceFileOutputFormat.class);
        job.setOutputKeyClass(FloatWritable.class);
        job.setOutputKeyComparatorClass(DecreasingFloatComparator.class);
        job.setOutputValueClass(SelectorEntry.class);
        job.setOutputFormat(GeneratorOutputFormat.class);

        try {
            JobClient.runJob(job);
        } catch (IOException e) {
            throw e;
        }

        // read the subdirectories generated in the temp
        // output and turn them into segments
        List<Path> generatedSegments = new ArrayList<Path>();

        FileStatus[] status = fs.listStatus(tempDir);
        try {
            for (FileStatus stat : status) {
                Path subfetchlist = stat.getPath();
                if (!subfetchlist.getName().startsWith("fetchlist-")) {
                    continue;
                }
                // start a new partition job for this segment
                Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists);
                generatedSegments.add(newSeg);
            }
        } catch (Exception e) {
            LOG.warn("ReGenerator: exception while partitioning segments, exiting ...");
            fs.delete(tempDir, true);
            return null;
        }

        if (generatedSegments.size() == 0) {
            LOG.warn("ReGenerator: 0 records selected for fetching, exiting ...");
            LockUtil.removeLockFile(fs, lock);
            fs.delete(tempDir, true);
            return null;
        }

        if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
            // update the db from tempDir
            Path tempDir2 = new Path(
                    getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

            job = new NutchJob(getConf());
            job.setJobName("generate: updatedb " + dbDir);
            job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
            for (Path segmpaths : generatedSegments) {
                Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME);
                FileInputFormat.addInputPath(job, subGenDir);
            }
            FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
            job.setInputFormat(SequenceFileInputFormat.class);
            job.setMapperClass(CrawlDbUpdater.class);
            job.setReducerClass(CrawlDbUpdater.class);
            job.setOutputFormat(MapFileOutputFormat.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(CrawlDatum.class);
            FileOutputFormat.setOutputPath(job, tempDir2);
            try {
                JobClient.runJob(job);
                CrawlDb.install(job, dbDir);
            } catch (IOException e) {
                LockUtil.removeLockFile(fs, lock);
                fs.delete(tempDir, true);
                fs.delete(tempDir2, true);
                throw e;
            }
            fs.delete(tempDir2, true);
        }

        LockUtil.removeLockFile(fs, lock);
        fs.delete(tempDir, true);

        long end = System.currentTimeMillis();
        LOG.info("ReGenerator: finished at " + sdf.format(end) + ", elapsed: "
                + TimingUtil.elapsedTime(start, end));

        Path[] patharray = new Path[generatedSegments.size()];
        return generatedSegments.toArray(patharray);
    } catch (Exception ex) {
        LOG.error("ReGenerator generate error: " + ex.toString(), ex);
        return null;
    }
}

From source file:net.peacesoft.nutch.crawl.ReLinkDb.java

License:Apache License

private static JobConf createJob(Configuration config, Path linkDb, boolean normalize, boolean filter) {
    Path newLinkDb = new Path("linkdb-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf job = new NutchJob(config);
    job.setJobName("linkdb " + linkDb);

    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(ReLinkDb.class);
    job.setCombinerClass(LinkDbMerger.class);
    // if we don't run the mergeJob, perform normalization/filtering now
    if (normalize || filter) {
        try {
            FileSystem fs = FileSystem.get(config);
            if (!fs.exists(linkDb)) {
                job.setBoolean(LinkDbFilter.URL_FILTERING, filter);
                job.setBoolean(LinkDbFilter.URL_NORMALIZING, normalize);
            }
        } catch (Exception e) {
            LOG.warn("ReLinkDb createJob: " + e);
        }
    }
    job.setReducerClass(LinkDbMerger.class);

    FileOutputFormat.setOutputPath(job, newLinkDb);
    job.setOutputFormat(MapFileOutputFormat.class);
    job.setBoolean("mapred.output.compress", true);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Inlinks.class);

    return job;
}

From source file:net.peacesoft.nutch.crawl.ReSolrIndexer.java

License:Apache License

public void indexSolr(String solrUrl, Path inputDb, Path crawlDb, Path linkDb, List<Path> segments,
        boolean noCommit, boolean deleteGone, String solrParams, boolean filter, boolean normalize)
        throws IOException {

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("SolrIndexer: starting at " + sdf.format(start));

    final JobConf job = new NutchJob(getConf());
    job.setJobName("index-solr " + solrUrl);

    LOG.info("SolrIndexer: deleting gone documents: " + deleteGone);
    LOG.info("SolrIndexer: URL filtering: " + filter);
    LOG.info("SolrIndexer: URL normalizing: " + normalize);

    ReIndexerMapReduce.initMRJob(inputDb, crawlDb, linkDb, segments, job);

    job.set(ReSolrConstants.SERVER_URL, solrUrl);
    job.setBoolean(IndexerMapReduce.INDEXER_DELETE, deleteGone);
    job.setBoolean(IndexerMapReduce.URL_FILTERING, filter);
    job.setBoolean(IndexerMapReduce.URL_NORMALIZING, normalize);
    if (solrParams != null) {
        job.set(ReSolrConstants.PARAMS, solrParams);
    }
    NutchIndexWriterFactory.addClassToConf(job, ReSolrWriter.class);

    job.setReduceSpeculativeExecution(false);

    final Path tmp = new Path("tmp_" + System.currentTimeMillis() + "-" + new Random().nextInt());

    FileOutputFormat.setOutputPath(job, tmp);
    try {
        JobClient.runJob(job);
        // do the commits once and for all the reducers in one go
        SolrServer solr = SolrUtils.getCommonsHttpSolrServer(job);

        if (!noCommit) {
            solr.commit();
        }
        long end = System.currentTimeMillis();
        LOG.info("SolrIndexer: finished at " + sdf.format(end) + ", elapsed: "
                + TimingUtil.elapsedTime(start, end));
    } catch (Exception e) {
        LOG.error(e.toString());
    } finally {
        FileSystem.get(job).delete(tmp, true);
    }
}