Example usage for org.apache.hadoop.mapred JobConf setReducerClass

List of usage examples for org.apache.hadoop.mapred JobConf setReducerClass

Introduction

In this page you can find the example usage for org.apache.hadoop.mapred JobConf setReducerClass.

Prototype

public void setReducerClass(Class<? extends Reducer> theClass) 

Source Link

Document

Set the Reducer class for the job.

Usage

From source file:com.ricemap.spateDB.operations.Plot.java

License:Apache License

public static <S extends Shape> void plotMapReduce(Path inFile, Path outFile, Shape shape, int width,
        int height, Color color, boolean showBorders, boolean showBlockCount, boolean showRecordCount,
        boolean background) throws IOException {
    JobConf job = new JobConf(Plot.class);
    job.setJobName("Plot");

    job.setMapperClass(PlotMap.class);
    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5);
    job.setReducerClass(PlotReduce.class);
    job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks()));
    job.setMapOutputKeyClass(Prism.class);
    SpatialSite.setShapeClass(job, shape.getClass());
    job.setMapOutputValueClass(shape.getClass());

    FileSystem inFs = inFile.getFileSystem(job);
    Prism fileMbr = FileMBR.fileMBRMapReduce(inFs, inFile, shape, false);
    FileStatus inFileStatus = inFs.getFileStatus(inFile);

    CellInfo[] cellInfos;/*from  w  ww . j a v  a 2 s . c  o m*/
    GlobalIndex<Partition> gindex = SpatialSite.getGlobalIndex(inFs, inFile);
    if (gindex == null) {
        // A heap file. The map function should partition the file
        GridInfo gridInfo = new GridInfo(fileMbr.t1, fileMbr.x1, fileMbr.y1, fileMbr.t2, fileMbr.x2,
                fileMbr.y2);
        gridInfo.calculateCellDimensions(inFileStatus.getLen(), inFileStatus.getBlockSize());
        cellInfos = gridInfo.getAllCells();
        // Doesn't make sense to show any partition information in a heap file
        showBorders = showBlockCount = showRecordCount = false;
    } else {
        cellInfos = SpatialSite.cellsOf(inFs, inFile);
    }

    // Set cell information in the job configuration to be used by the mapper
    SpatialSite.setCells(job, cellInfos);

    // Adjust width and height to maintain aspect ratio
    if ((fileMbr.x2 - fileMbr.x1) / (fileMbr.y2 - fileMbr.y1) > (double) width / height) {
        // Fix width and change height
        height = (int) ((fileMbr.y2 - fileMbr.y1) * width / (fileMbr.x2 - fileMbr.x1));
    } else {
        width = (int) ((fileMbr.x2 - fileMbr.x1) * height / (fileMbr.y2 - fileMbr.y1));
    }
    LOG.info("Creating an image of size " + width + "x" + height);
    ImageOutputFormat.setFileMBR(job, fileMbr);
    ImageOutputFormat.setImageWidth(job, width);
    ImageOutputFormat.setImageHeight(job, height);
    job.setBoolean(ShowBorders, showBorders);
    job.setBoolean(ShowBlockCount, showBlockCount);
    job.setBoolean(ShowRecordCount, showRecordCount);
    job.setInt(StrokeColor, color.getRGB());

    // Set input and output
    job.setInputFormat(ShapeInputFormat.class);
    ShapeInputFormat.addInputPath(job, inFile);
    // Set output committer which will stitch images together after all reducers
    // finish
    job.setOutputCommitter(PlotOutputCommitter.class);

    job.setOutputFormat(ImageOutputFormat.class);
    TextOutputFormat.setOutputPath(job, outFile);

    if (background) {
        JobClient jc = new JobClient(job);
        lastSubmittedJob = jc.submitJob(job);
    } else {
        lastSubmittedJob = JobClient.runJob(job);
    }
}

From source file:com.ricemap.spateDB.operations.RangeQuery.java

License:Apache License

/**
 * Performs a range query using MapReduce
 * //  w w  w  .  j  a va 2 s .  c o m
 * @param fs
 * @param inputFile
 * @param queryRange
 * @param shape
 * @param output
 * @return
 * @throws IOException
 */
public static long rangeQueryMapReduce(FileSystem fs, Path inputFile, Path userOutputPath, Shape queryShape,
        Shape shape, boolean overwrite, boolean background, QueryInput query) throws IOException {
    JobConf job = new JobConf(FileMBR.class);

    FileSystem outFs = inputFile.getFileSystem(job);
    Path outputPath = userOutputPath;
    if (outputPath == null) {
        do {
            outputPath = new Path(
                    inputFile.toUri().getPath() + ".rangequery_" + (int) (Math.random() * 1000000));
        } while (outFs.exists(outputPath));
    } else {
        if (outFs.exists(outputPath)) {
            if (overwrite) {
                outFs.delete(outputPath, true);
            } else {
                throw new RuntimeException("Output path already exists and -overwrite flag is not set");
            }
        }
    }

    job.setJobName("RangeQuery");
    job.setClass(SpatialSite.FilterClass, RangeFilter.class, BlockFilter.class);
    RangeFilter.setQueryRange(job, queryShape); // Set query range for
    // filter

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5);
    job.setNumReduceTasks(3);

    // Decide which map function to use depending on how blocks are indexed
    // And also which input format to use
    if (SpatialSite.isRTree(fs, inputFile)) {
        // RTree indexed file
        LOG.info("Searching an RTree indexed file");
        job.setInputFormat(RTreeInputFormat.class);
    } else {
        // A file with no local index
        LOG.info("Searching a non local-indexed file");
        job.setInputFormat(ShapeInputFormat.class);
    }

    GlobalIndex<Partition> gIndex = SpatialSite.getGlobalIndex(fs, inputFile);
    // if (gIndex != null && gIndex.isReplicated()){
    // job.setMapperClass(RangeQueryMap.class);

    Class<?> OutputKey = NullWritable.class;
    try {
        Class<?> c = shape.getClass();
        Field f = c.getDeclaredField(query.field);
        f.setAccessible(true);
        if (f.getType().equals(Integer.TYPE)) {
            OutputKey = IntWritable.class;
        } else if (f.getType().equals(Double.TYPE)) {
            OutputKey = DoubleWritable.class;
        } else if (f.getType().equals(Long.TYPE)) {
            OutputKey = LongWritable.class;
        }
    } catch (SecurityException e) {
        e.printStackTrace();
    } catch (NoSuchFieldException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }

    job.setMapOutputKeyClass(OutputKey);
    switch (query.type) {
    case Distinct:
        job.setMapperClass(DistinctQueryMap.class);
        job.setReducerClass(DistinctQueryReduce.class);
        job.setMapOutputValueClass(NullWritable.class);
        break;
    case Distribution:
        job.setMapperClass(DistributionQueryMap.class);
        job.setReducerClass(DistributionQueryReduce.class);
        job.setMapOutputValueClass(IntWritable.class);
        break;
    default:
        break;
    }
    // }
    // else
    // job.setMapperClass(RangeQueryMapNoDupAvoidance.class);

    // Set query range for the map function
    job.set(QUERY_SHAPE_CLASS, queryShape.getClass().getName());
    job.set(QUERY_SHAPE, queryShape.toText(new Text()).toString());
    job.set(QUERY_FIELD, query.field);

    // Set shape class for the SpatialInputFormat
    SpatialSite.setShapeClass(job, shape.getClass());

    job.setOutputFormat(TextOutputFormat.class);

    ShapeInputFormat.setInputPaths(job, inputFile);
    TextOutputFormat.setOutputPath(job, outputPath);

    // Submit the job
    if (!background) {
        RunningJob runningJob = JobClient.runJob(job);
        Counters counters = runningJob.getCounters();
        Counter outputRecordCounter = counters.findCounter(Task.Counter.MAP_OUTPUT_RECORDS);
        final long resultCount = outputRecordCounter.getValue();

        // If outputPath not set by user, automatically delete it
        if (userOutputPath == null)
            outFs.delete(outputPath, true);

        return resultCount;
    } else {
        JobClient jc = new JobClient(job);
        lastRunningJob = jc.submitJob(job);
        return -1;
    }
}

From source file:com.ricemap.spateDB.operations.RecordCount.java

License:Apache License

/**
 * Counts the exact number of lines in a file by issuing a MapReduce job
 * that does the thing/*from w w  w. j  av a  2s  . c  o  m*/
 * @param conf
 * @param fs
 * @param file
 * @return
 * @throws IOException 
 */
public static long recordCountMapReduce(FileSystem fs, Path file) throws IOException {
    JobConf job = new JobConf(RecordCount.class);

    Path outputPath = new Path(file.toUri().getPath() + ".linecount");
    FileSystem outFs = outputPath.getFileSystem(job);
    outFs.delete(outputPath, true);

    job.setJobName("LineCount");
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(LongWritable.class);

    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);
    job.setCombinerClass(Reduce.class);

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5);
    job.setNumReduceTasks(1);

    job.setInputFormat(ShapeLineInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);

    ShapeLineInputFormat.setInputPaths(job, file);
    TextOutputFormat.setOutputPath(job, outputPath);

    // Submit the job
    JobClient.runJob(job);

    // Read job result
    long lineCount = 0;
    FileStatus[] results = outFs.listStatus(outputPath);
    for (FileStatus fileStatus : results) {
        if (fileStatus.getLen() > 0 && fileStatus.getPath().getName().startsWith("part-")) {
            LineReader lineReader = new LineReader(outFs.open(fileStatus.getPath()));
            Text text = new Text();
            if (lineReader.readLine(text) > 0) {
                lineCount = Long.parseLong(text.toString());
            }
            lineReader.close();
        }
    }

    outFs.delete(outputPath, true);

    return lineCount;
}

From source file:com.ricemap.spateDB.operations.Repartition.java

License:Apache License

/**
 * Repartitions an input file according to the given list of cells.
 * @param inFile/*from  w  ww. j a  v  a2 s.c  o  m*/
 * @param outPath
 * @param cellInfos
 * @param pack
 * @param rtree
 * @param overwrite
 * @throws IOException
 */
public static void repartitionMapReduce(Path inFile, Path outPath, Shape stockShape, long blockSize,
        CellInfo[] cellInfos, String sindex, boolean overwrite, boolean columnar) throws IOException {
    JobConf job = new JobConf(Repartition.class);
    job.setJobName("Repartition");
    FileSystem outFs = outPath.getFileSystem(job);

    // Overwrite output file
    if (outFs.exists(outPath)) {
        if (overwrite)
            outFs.delete(outPath, true);
        else
            throw new RuntimeException(
                    "Output file '" + outPath + "' already exists and overwrite flag is not set");
    }

    // Decide which map function to use depending on the type of global index
    if (sindex.equals("rtree")) {
        // Repartition without replication
        job.setMapperClass(RepartitionMapNoReplication.class);
    } else {
        // Repartition with replication (grid and r+tree)
        job.setMapperClass(RepartitionMap.class);
    }
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(stockShape.getClass());
    ShapeInputFormat.setInputPaths(job, inFile);
    job.setInputFormat(ShapeInputFormat.class);
    boolean pack = sindex.equals("r+tree");
    boolean expand = sindex.equals("rtree");
    job.setBoolean(SpatialSite.PACK_CELLS, pack);
    job.setBoolean(SpatialSite.EXPAND_CELLS, expand);
    job.setStrings(SpatialSite.STORAGE_MODE, columnar ? "columnar" : "normal");

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setNumMapTasks(10 * Math.max(1, clusterStatus.getMaxMapTasks()));

    // Set default parameters for reading input file
    SpatialSite.setShapeClass(job, stockShape.getClass());

    FileOutputFormat.setOutputPath(job, outPath);
    if (sindex.equals("grid")) {
        job.setOutputFormat(GridOutputFormat.class);
    } else if (sindex.equals("rtree") || sindex.equals("r+tree")) {
        // For now, the two types of local index are the same
        job.setOutputFormat(RTreeGridOutputFormat.class);
    } else {
        throw new RuntimeException("Unsupported spatial index: " + sindex);
    }
    // Copy block size from source file if it's globally indexed
    FileSystem inFs = inFile.getFileSystem(job);

    if (blockSize == 0) {
        GlobalIndex<Partition> globalIndex = SpatialSite.getGlobalIndex(inFs, inFile);
        if (globalIndex != null) {
            blockSize = inFs.getFileStatus(new Path(inFile, globalIndex.iterator().next().filename))
                    .getBlockSize();
            LOG.info("Automatically setting block size to " + blockSize);
        }
    }

    if (blockSize != 0)
        job.setLong(SpatialSite.LOCAL_INDEX_BLOCK_SIZE, blockSize);
    SpatialSite.setCells(job, cellInfos);
    job.setBoolean(SpatialSite.OVERWRITE, overwrite);

    // Set reduce function
    job.setReducerClass(RepartitionReduce.class);
    job.setNumReduceTasks(
            Math.max(1, Math.min(cellInfos.length, (clusterStatus.getMaxReduceTasks() * 9 + 5) / 10)));

    // Set output committer that combines output files together
    job.setOutputCommitter(RepartitionOutputCommitter.class);

    JobClient.runJob(job);
}

From source file:com.ricemap.spateDB.util.RandomSpatialGenerator.java

License:Apache License

public static void generateMapReduce(Path file, Prism mbr, long size, long blocksize, Shape shape,
        String sindex, long seed, int rectsize, RandomShapeGenerator.DistributionType type, boolean overwrite)
        throws IOException {
    JobConf job = new JobConf(RandomSpatialGenerator.class);

    job.setJobName("Generator");
    FileSystem outFs = file.getFileSystem(job);

    // Overwrite output file
    if (outFs.exists(file)) {
        if (overwrite)
            outFs.delete(file, true);/*from  ww w  .j  a va2 s.  c  o  m*/
        else
            throw new RuntimeException(
                    "Output file '" + file + "' already exists and overwrite flag is not set");
    }

    // Set generation parameters in job
    job.setLong(RandomShapeGenerator.GenerationSize, size);
    SpatialSite.setPrism(job, RandomShapeGenerator.GenerationMBR, mbr);
    if (seed != 0)
        job.setLong(RandomShapeGenerator.GenerationSeed, seed);
    if (rectsize != 0)
        job.setInt(RandomShapeGenerator.GenerationRectSize, rectsize);
    if (type != null)
        job.set(RandomShapeGenerator.GenerationType, type.toString());

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    // Set input format and map class
    job.setInputFormat(RandomInputFormat.class);
    job.setMapperClass(Repartition.RepartitionMap.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(shape.getClass());
    job.setNumMapTasks(10 * Math.max(1, clusterStatus.getMaxMapTasks()));

    SpatialSite.setShapeClass(job, shape.getClass());

    if (blocksize != 0) {
        job.setLong(SpatialSite.LOCAL_INDEX_BLOCK_SIZE, blocksize);
    }

    CellInfo[] cells;
    if (sindex == null) {
        cells = new CellInfo[] { new CellInfo(1, mbr) };
    } else if (sindex.equals("grid")) {
        GridInfo gridInfo = new GridInfo(mbr.t1, mbr.x1, mbr.y1, mbr.t2, mbr.x2, mbr.y2);
        FileSystem fs = file.getFileSystem(job);
        if (blocksize == 0) {
            blocksize = fs.getDefaultBlockSize(file);
        }
        int numOfCells = Repartition.calculateNumberOfPartitions(job, size, fs, file, blocksize);
        gridInfo.calculateCellDimensions(numOfCells);
        cells = gridInfo.getAllCells();
    } else {
        throw new RuntimeException("Unsupported spatial index: " + sindex);
    }

    SpatialSite.setCells(job, cells);

    // Do not set a reduce function. Use the default identity reduce function
    if (cells.length == 1) {
        // All objects are in one partition. No need for a reduce phase
        job.setNumReduceTasks(0);
    } else {
        // More than one partition. Need a reduce phase to group shapes of the
        // same partition together
        job.setReducerClass(RepartitionReduce.class);
        job.setNumReduceTasks(
                Math.max(1, Math.min(cells.length, (clusterStatus.getMaxReduceTasks() * 9 + 5) / 10)));
    }

    // Set output path
    FileOutputFormat.setOutputPath(job, file);
    if (sindex == null || sindex.equals("grid")) {
        job.setOutputFormat(GridOutputFormat.class);
    } else {
        throw new RuntimeException("Unsupported spatial index: " + sindex);
    }

    JobClient.runJob(job);

    // Concatenate all master files into one file
    FileStatus[] resultFiles = outFs.listStatus(file, new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().contains("_master");
        }
    });
    String ext = resultFiles[0].getPath().getName()
            .substring(resultFiles[0].getPath().getName().lastIndexOf('.'));
    Path masterPath = new Path(file, "_master" + ext);
    OutputStream destOut = outFs.create(masterPath);
    byte[] buffer = new byte[4096];
    for (FileStatus f : resultFiles) {
        InputStream in = outFs.open(f.getPath());
        int bytes_read;
        do {
            bytes_read = in.read(buffer);
            if (bytes_read > 0)
                destOut.write(buffer, 0, bytes_read);
        } while (bytes_read > 0);
        in.close();
        outFs.delete(f.getPath(), false);
    }
    destOut.close();

    // Plot an image for the partitions used in file
    Path imagePath = new Path(file, "_partitions.png");
    int imageSize = (int) (Math.sqrt(cells.length) * 300);
    Plot.plotLocal(masterPath, imagePath, new Partition(), imageSize, imageSize, Color.BLACK, false, false,
            false);
}

From source file:com.scaleoutsoftware.soss.hserver.Test_WordCountMapred.java

License:Apache License

/**
 * The main driver for word count map/reduce program.
 * Invoke this method to submit the map/reduce job.
 * @throws IOException When there is communication problems with the
 *                     job tracker./*from ww  w .  j  a v a2s  .  c  om*/
 */
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), Test_WordCountMapred.class);
    conf.setJobName("wordcount");

    // the keys are words (strings)
    conf.setOutputKeyClass(Text.class);
    // the values are counts (ints)
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(MapClass.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);
    conf.setNumReduceTasks(0);

    String in = args.length == 2 ? args[0] : "random.txt";
    String out = args.length == 2 ? args[1]
            : "c:\\development\\mapred_output\\dir" + System.currentTimeMillis();

    FileInputFormat.setInputPaths(conf, new Path(in));
    FileOutputFormat.setOutputPath(conf, new Path(out));

    InvocationGrid grid = HServerJob.getInvocationGridBuilder("MyGrid" + System.currentTimeMillis())
            .addJar("/path/to/your/jar").load();

    // HERE IS STANDARD HADOOP INVOCATION
    //JobClient.runJob(conf);

    // HSERVER INVOCATION
    HServerJobClient.runJob(conf, false, grid);
    return 0;
}

From source file:com.spotify.hdfs2cass.BulkLoader.java

License:Apache License

public int run(String[] args) throws Exception {
    CommandLine cmdLine = parseOptions(args);

    String[] inputPaths = cmdLine.getOptionValues('i');
    String seedNodeHost = cmdLine.getOptionValue('h');
    String seedNodePort = cmdLine.getOptionValue('p', "9160");
    String keyspace = cmdLine.getOptionValue('k');
    String colfamily = cmdLine.getOptionValue('c');
    int mappers = Integer.parseInt(cmdLine.getOptionValue('m', "0"));
    Integer copiers = Integer.parseInt(cmdLine.getOptionValue('P', "0"));
    String poolName = cmdLine.getOptionValue("pool");

    ClusterInfo clusterInfo = new ClusterInfo(seedNodeHost, seedNodePort);
    clusterInfo.init(keyspace);/*  ww w  .  ja  v a 2 s  . c om*/

    final String partitionerClass = clusterInfo.getPartitionerClass();
    final int reducers = adjustReducers(Integer.parseInt(cmdLine.getOptionValue('r', "0")),
            clusterInfo.getNumClusterNodes());

    Configuration conf = new Configuration();
    ConfigHelper.setOutputColumnFamily(conf, keyspace, colfamily);
    ConfigHelper.setOutputInitialAddress(conf, seedNodeHost);
    ConfigHelper.setOutputRpcPort(conf, seedNodePort);
    ConfigHelper.setOutputPartitioner(conf, partitionerClass);

    if (cmdLine.hasOption('s')) {
        conf.set("mapreduce.output.bulkoutputformat.buffersize", cmdLine.getOptionValue('s', "32"));
    }

    if (cmdLine.hasOption('M')) {
        conf.set("mapreduce.output.bulkoutputformat.streamthrottlembits", cmdLine.getOptionValue('M'));
    }

    if (cmdLine.hasOption('C')) {
        ConfigHelper.setOutputCompressionClass(conf, cmdLine.getOptionValue('C'));
    }

    if (cmdLine.hasOption('b')) {
        conf.setBoolean("com.spotify.hdfs2cass.base64", true);
    }

    JobConf job = new JobConf(conf);

    if (mappers > 0)
        job.setNumMapTasks(mappers);
    if (reducers > 0)
        job.setNumReduceTasks(reducers);
    if (copiers > 0)
        job.set("mapred.reduce.parallel.copies", copiers.toString());

    if (poolName != null)
        job.set("mapred.fairscheduler.pool", poolName);

    // set the nodes as a param for the other hadoop nodes
    clusterInfo.setConf(job);

    String jobName = "bulkloader-hdfs-to-cassandra";
    if (cmdLine.hasOption('n'))
        jobName += "-" + cmdLine.getOptionValue('n');
    job.setJobName(jobName);
    job.setJarByClass(BulkLoader.class);

    job.setInputFormat(AvroAsTextInputFormat.class);

    for (String inputPath : inputPaths) {
        FileInputFormat.addInputPath(job, new Path(inputPath));
    }

    //map just outputs text, reduce sends to cassandra
    job.setMapperClass(MapToText.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    job.setPartitionerClass(CassandraPartitioner.class);

    job.setReducerClass(ReduceTextToCassandra.class);
    job.setOutputKeyClass(ByteBuffer.class);
    job.setOutputValueClass(List.class);

    if (cmdLine.hasOption('s'))
        job.setOutputFormat(BulkOutputFormat.class);
    else
        job.setOutputFormat(ColumnFamilyOutputFormat.class);

    JobClient.runJob(job);
    return 0;
}

From source file:com.talis.mapreduce.wordcount.oldapi.WordCount.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getSimpleName());
        ToolRunner.printGenericCommandUsage(System.err);
        return -1;
    }/*from  w w w  . ja v  a 2  s.  co  m*/

    JobConf conf = new JobConf(getConf(), getClass());
    conf.setJobName("Word Count");

    FileInputFormat.addInputPath(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    conf.setMapperClass(WordCountMapper.class);
    conf.setCombinerClass(WordCountReducer.class);
    conf.setReducerClass(WordCountReducer.class);

    //      conf.setPartitionerClass(HashPartitioner.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    JobClient.runJob(conf);

    return 0;
}

From source file:com.TCG.Nutch_DNS.Generator.java

License:Apache License

/**
 * Generate fetchlists in one or more segments. Whether to filter URLs or not
 * is read from the crawl.generate.filter property in the configuration files.
 * If the property is not found, the URLs are filtered. Same for the
 * normalisation./*  w  w  w.j  av a 2s  . c om*/
 * 
 * @param dbDir
 *          Crawl database directory
 * @param segments
 *          Segments directory
 * @param numLists
 *          Number of reduce tasks
 * @param topN
 *          Number of top URLs to be selected
 * @param curTime
 *          Current time in milliseconds
 * 
 * @return Path to generated segment or null if no entries were selected
 * 
 * @throws IOException
 *           When an I/O error occurs
 */
public Path[] generate(Path dbDir, Path segments, int numLists, long topN, long curTime, boolean filter,
        boolean norm, boolean force, int maxNumSegments) throws IOException {

    Path tempDir = new Path(
            getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + UUID.randomUUID().toString());

    Path lock = new Path(dbDir, CrawlDb.LOCK_NAME);
    FileSystem fs = FileSystem.get(getConf());
    LockUtil.createLockFile(fs, lock, force);

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("Generator: starting at " + sdf.format(start));
    LOG.info("Generator: Selecting best-scoring urls due for fetch.");
    LOG.info("Generator: filtering: " + filter);
    LOG.info("Generator: normalizing: " + norm);
    if (topN != Long.MAX_VALUE) {
        LOG.info("Generator: topN: " + topN);
    }

    // map to inverted subset due for fetch, sort by score
    JobConf job = new NutchJob(getConf());
    job.setJobName("generate: select from " + dbDir);

    if (numLists == -1) { // for politeness make
        numLists = job.getNumMapTasks(); // a partition per fetch task
    }
    if ("local".equals(job.get("mapred.job.tracker")) && numLists != 1) {
        // override
        LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
        numLists = 1;
    }
    job.setLong(GENERATOR_CUR_TIME, curTime);
    // record real generation time
    long generateTime = System.currentTimeMillis();
    job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
    job.setLong(GENERATOR_TOP_N, topN);
    job.setBoolean(GENERATOR_FILTER, filter);
    job.setBoolean(GENERATOR_NORMALISE, norm);
    job.setInt(GENERATOR_MAX_NUM_SEGMENTS, maxNumSegments);

    FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(Selector.class);
    job.setPartitionerClass(Selector.class);
    job.setReducerClass(Selector.class);

    FileOutputFormat.setOutputPath(job, tempDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(FloatWritable.class);
    job.setOutputKeyComparatorClass(DecreasingFloatComparator.class);
    job.setOutputValueClass(SelectorEntry.class);
    job.setOutputFormat(GeneratorOutputFormat.class);

    try {
        JobClient.runJob(job);
    } catch (IOException e) {
        LockUtil.removeLockFile(fs, lock);
        fs.delete(tempDir, true);
        throw e;
    }

    // read the subdirectories generated in the temp
    // output and turn them into segments
    List<Path> generatedSegments = new ArrayList<Path>();

    FileStatus[] status = fs.listStatus(tempDir);
    try {
        for (FileStatus stat : status) {
            Path subfetchlist = stat.getPath();
            if (!subfetchlist.getName().startsWith("fetchlist-"))
                continue;
            // start a new partition job for this segment
            Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists);
            generatedSegments.add(newSeg);
        }
    } catch (Exception e) {
        LOG.warn("Generator: exception while partitioning segments, exiting ...");
        fs.delete(tempDir, true);
        return null;
    }

    if (generatedSegments.size() == 0) {
        LOG.warn("Generator: 0 records selected for fetching, exiting ...");
        LockUtil.removeLockFile(fs, lock);
        fs.delete(tempDir, true);
        return null;
    }

    if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
        // update the db from tempDir
        Path tempDir2 = new Path(
                getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + UUID.randomUUID().toString());

        job = new NutchJob(getConf());
        job.setJobName("generate: updatedb " + dbDir);
        job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
        for (Path segmpaths : generatedSegments) {
            Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME);
            FileInputFormat.addInputPath(job, subGenDir);
        }
        FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setMapperClass(CrawlDbUpdater.class);
        job.setReducerClass(CrawlDbUpdater.class);
        job.setOutputFormat(MapFileOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(CrawlDatum.class);
        FileOutputFormat.setOutputPath(job, tempDir2);
        try {
            JobClient.runJob(job);
            CrawlDb.install(job, dbDir);
        } catch (IOException e) {
            LockUtil.removeLockFile(fs, lock);
            fs.delete(tempDir, true);
            fs.delete(tempDir2, true);
            throw e;
        }
        fs.delete(tempDir2, true);
    }

    LockUtil.removeLockFile(fs, lock);
    fs.delete(tempDir, true);

    long end = System.currentTimeMillis();
    LOG.info("Generator: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));

    Path[] patharray = new Path[generatedSegments.size()];
    return generatedSegments.toArray(patharray);
}

From source file:com.TCG.Nutch_DNS.HostDb.java

License:Apache License

public static JobConf createJob(Configuration config, Path crawlDb) throws IOException {
    Path newCrawlDb = new Path(crawlDb, Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf job = new NutchJob(config);
    job.setJobName("crawldb " + crawlDb);

    Path current = new Path(crawlDb, CURRENT_NAME);
    if (FileSystem.get(job).exists(current)) {
        FileInputFormat.addInputPath(job, current);
    }/* w w w  .  j av a2  s. c om*/
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(HostDbFilter.class);
    job.setReducerClass(HostDbReducer.class);

    FileOutputFormat.setOutputPath(job, newCrawlDb);
    job.setOutputFormat(MapFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);

    // https://issues.apache.org/jira/browse/NUTCH-1110
    job.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    return job;
}