Usage examples for org.apache.hadoop.mapred.JobConf.get(String name)
public String get(String name)
Gets the value of the name property, or null if no such property exists.
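As a minimal sketch (the property keys below are placeholders invented for illustration, not keys used by the examples that follow), get(String) returns null for an unset key, while the two-argument overload get(String, String) substitutes a default:

import org.apache.hadoop.mapred.JobConf;

public class JobConfGetSketch {
    public static void main(String[] args) {
        JobConf conf = new JobConf();
        conf.set("example.key", "value");                            // placeholder key set just for this demo
        System.out.println(conf.get("example.key"));                 // prints: value
        System.out.println(conf.get("example.missing"));             // prints: null (property was never set)
        System.out.println(conf.get("example.missing", "fallback")); // two-argument overload returns the default
    }
}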
From source file:kafka.etl.impl.SimpleKafkaETLJob.java
License:Apache License
public void execute() throws Exception {
    JobConf conf = createJobConf();
    RunningJob runningJob = new JobClient(conf).submitJob(conf);
    String id = runningJob.getJobID();
    System.out.println("Hadoop job id=" + id);
    runningJob.waitForCompletion();
    if (!runningJob.isSuccessful())
        throw new Exception("Hadoop ETL job failed! Please check status on http://"
                + conf.get("mapred.job.tracker") + "/jobdetails.jsp?jobid=" + id);
}
From source file:me.tingri.graphs.gimv.JoinMapper.java
License:Apache License
public void configure(JobConf conf) {
    fieldSeparator = conf.get(CONSTANTS.FIELD_SEPARATOR);
    vectorIndicator = conf.get(CONSTANTS.VECTOR_INDICATOR);
    makeSymmetric = (FLAGS.YES.getValue() == Integer.parseInt(conf.get(CONSTANTS.MAKE_SYMMETRIC)))
            ? FLAGS.YES : FLAGS.NO;
}
From source file:me.tingri.graphs.gimv.JoinReducer.java
License:Apache License
public void configure(JobConf conf) {
    vectorIndicator = conf.get(CONSTANTS.VECTOR_INDICATOR);
}
From source file:me.tingri.graphs.gimv.VectorGeneratorMapper.java
License:Apache License
public void configure(JobConf conf) {
    fieldSeparator = conf.get(CONSTANTS.FIELD_SEPARATOR);
    makeSymmetric = (FLAGS.YES.getValue() == Integer.parseInt(conf.get(CONSTANTS.MAKE_SYMMETRIC)))
            ? FLAGS.YES : FLAGS.NO;
}
From source file:microbench.TeraSortOnHDFSDataLocal.java
License:Apache License
public static void main(String[] args)
        throws IOException, InterruptedException, URISyntaxException, MPIException {
    try {
        if (!TaskAttemptContext.class.isInterface()) {
            throw new IOException("Currently TeraSort benchmark is supported under Hadoop-2.x runtime");
        }
        parseArgs(args);
        HashMap<String, String> conf = new HashMap<String, String>();
        initConf(conf);
        JobConf jobConf = new JobConf(confPath);
        conf.put(FS_DEFALUT_NAME, jobConf.get(FS_DEFALUT_NAME));
        MPI_D.Init(args, MPI_D.Mode.Common, conf);

        if (MPI_D.COMM_BIPARTITE_O != null) {
            // O communicator
            int rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_O);
            if (rank == 0) {
                System.out.println(TeraSortOnHDFSDataLocal.class.getSimpleName() + " O start.");
                DataMPIUtil.printArgs(args);
            }
            HadoopReader<Text, Text> reader = HadoopIOUtil.getReader(jobConf, inDir,
                    TeraInputFormat.class, rank, MPI_D.COMM_BIPARTITE_O);
            Text khead = reader.createKey();
            Text vhead = reader.createValue();
            while (reader.next(khead, vhead)) {
                // send key-value
                MPI_D.Send(khead, vhead);
            }
            reader.close();
        } else if (MPI_D.COMM_BIPARTITE_A != null) {
            // A communicator
            int rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_A);
            if (rank == 0) {
                System.out.println(TeraSortOnHDFSDataLocal.class.getSimpleName() + " A start.");
            }
            HadoopWriter<Text, Text> outrw = HadoopIOUtil.getNewWriter(jobConf, outDir, Text.class,
                    Text.class, TeraOutputFormat.class, null, rank, MPI_D.COMM_BIPARTITE_A);
            // recv key-value.
            Object[] keyValue = MPI_D.Recv();
            while (keyValue != null) {
                outrw.write((Text) keyValue[0], (Text) keyValue[1]);
                keyValue = MPI_D.Recv();
            }
            outrw.close();
        }
        MPI_D.Finalize();
    } catch (MPI_D_Exception e) {
        e.printStackTrace();
    }
}
From source file:net.darkseraphim.webanalytics.hadoop.csv.CSVTextInputFormat.java
License:Apache License
@Override
public RecordReader<LongWritable, List<Text>> getRecordReader(InputSplit split, JobConf conf,
        Reporter reporter) throws IOException {
    String quote = conf.get(CSVLineRecordReader.FORMAT_DELIMITER);
    String separator = conf.get(CSVLineRecordReader.FORMAT_SEPARATOR);
    conf.set(CSVLineRecordReader.FORMAT_DELIMITER, CSVLineRecordReader.DEFAULT_DELIMITER);
    conf.set(CSVLineRecordReader.FORMAT_SEPARATOR, CSVLineRecordReader.DEFAULT_SEPARATOR);
    conf.setBoolean(CSVLineRecordReader.IS_ZIPFILE, false);
    System.out.println("[LOG] Created reader");
    if (split instanceof FileSplit) {
        return reader = new CSVLineRecordReader(split, conf);
    }
    throw new UnsupportedOperationException("Only FileSplits are supported");
}
From source file:net.iponweb.hadoop.streaming.avro.AvroAsJsonOutputFormat.java
License:Apache License
@Override
public RecordWriter<Text, NullWritable> getRecordWriter(FileSystem ignore, JobConf job, String name,
        Progressable prog) throws IOException {
    Schema schema;
    Schema.Parser p = new Schema.Parser();
    String strSchema = job.get("iow.streaming.output.schema");

    if (strSchema == null) {
        String schemaFile = job.get("iow.streaming.output.schema.file", "streaming_output_schema");
        if (job.getBoolean("iow.streaming.schema.use.prefix", false)) {
            // guess schema from file name
            // format is: schema:filename
            // with special keyword default - 'default:filename'
            String str[] = name.split(":");
            if (!str[0].equals("default"))
                schemaFile = str[0];
            name = str[1];
        }
        LOG.info(this.getClass().getSimpleName() + ": Using schema from file: " + schemaFile);
        File f = new File(schemaFile);
        schema = p.parse(f);
    } else {
        LOG.info(this.getClass().getSimpleName() + ": Using schema from jobconf.");
        schema = p.parse(strSchema);
    }

    if (schema == null) {
        throw new IOException("Can't find proper output schema");
    }

    DataFileWriter<GenericRecord> writer = new DataFileWriter<GenericRecord>(
            new GenericDatumWriter<GenericRecord>());
    configureDataFileWriter(writer, job);

    Path path = FileOutputFormat.getTaskOutputPath(job, name + org.apache.avro.mapred.AvroOutputFormat.EXT);
    writer.create(schema, path.getFileSystem(job).create(path));

    return createRecordWriter(writer, schema);
}
From source file:net.iponweb.hadoop.streaming.parquet.ParquetAsTextOutputFormat.java
License:Apache License
public RecordWriter<Text, Text> getRecordWriter(FileSystem fs, JobConf job, String name,
        Progressable progress) throws IOException {
    // find and load schema
    String writeSchema = job.get("iow.streaming.output.schema");
    MessageType s;

    if (writeSchema == null) {
        String schemaFile = job.get("iow.streaming.output.schema.file", "streaming_output_schema");
        if (job.getBoolean("iow.streaming.schema.use.prefix", false)) {
            // guess schema from file name
            // format is: schema:filename
            // with special keyword default - 'default:filename'
            String str[] = name.split(":");
            if (!str[0].equals("default"))
                schemaFile = str[0];
            name = str[1];
        }
        LOG.info("Using schema: " + schemaFile);
        File f = new File(schemaFile);
        try {
            BufferedReader reader = new BufferedReader(new FileReader(f));
            StringBuilder r = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null)
                r.append(line);
            writeSchema = r.toString();
        } catch (Throwable e) {
            LOG.error("Can't read schema file " + schemaFile);
            Throwables.propagateIfPossible(e, IOException.class);
            throw new RuntimeException(e);
        }
    }

    s = MessageTypeParser.parseMessageType(writeSchema);
    setWriteSupportClass(job, GroupWriteSupport.class);
    GroupWriteSupport.setSchema(s, job);

    CompressionCodecName codec = getCodec(job);
    String extension = codec.getExtension() + ".parquet";
    Path file = getDefaultWorkFile(job, name, extension);

    ParquetRecordWriter<SimpleGroup> realWriter;
    try {
        realWriter = (ParquetRecordWriter<SimpleGroup>) realOutputFormat.getRecordWriter(job, file, codec);
    } catch (InterruptedException e) {
        Thread.interrupted();
        throw new IOException(e);
    }

    return createRecordWriter(realWriter, fs, job, name, progress);
}
From source file:net.peacesoft.nutch.crawl.ReFetcher.java
License:Apache License
public void configure(JobConf job) {
    setConf(job);
    this.segmentName = job.get(Nutch.SEGMENT_NAME_KEY);
    this.storingContent = isStoringContent(job);
    this.parsing = isParsing(job);
}
From source file:net.peacesoft.nutch.crawl.ReGenerator.java
License:Apache License
/**
 * Generate fetchlists in one or more segments. Whether to filter URLs or
 * not is read from the crawl.generate.filter property in the configuration
 * files. If the property is not found, the URLs are filtered. Same for the
 * normalisation.
 *
 * @param dbDir Crawl database directory
 * @param segments Segments directory
 * @param numLists Number of reduce tasks
 * @param topN Number of top URLs to be selected
 * @param curTime Current time in milliseconds
 *
 * @return Path to generated segment or null if no entries were selected
 *
 * @throws IOException When an I/O error occurs
 */
public Path[] generate(Path dbDir, Path segments, int numLists, long topN, long curTime, boolean filter,
        boolean norm, boolean force, int maxNumSegments) throws IOException {
    try {
        Path tempDir = new Path(
                getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

        Path lock = new Path(dbDir, CrawlDb.LOCK_NAME);
        FileSystem fs = FileSystem.get(getConf());
        LockUtil.createLockFile(fs, lock, force);

        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        long start = System.currentTimeMillis();
        LOG.info("ReGenerator: starting at " + sdf.format(start));
        LOG.info("ReGenerator: Selecting best-scoring urls due for fetch.");
        LOG.info("ReGenerator: filtering: " + filter);
        LOG.info("ReGenerator: normalizing: " + norm);
        if (topN != Long.MAX_VALUE) {
            LOG.info("ReGenerator: topN: " + topN);
        }

        if ("true".equals(getConf().get(GENERATE_MAX_PER_HOST_BY_IP))) {
            LOG.info("ReGenerator: GENERATE_MAX_PER_HOST_BY_IP will be ignored, use partition.url.mode instead");
        }

        // map to inverted subset due for fetch, sort by score
        JobConf job = new NutchJob(getConf());
        job.setJobName("generate: select from " + dbDir);

        if (numLists == -1) { // for politeness make
            numLists = job.getNumMapTasks(); // a partition per fetch task
        }
        if ("local".equals(job.get("mapred.job.tracker")) && numLists != 1) {
            // override
            LOG.info("ReGenerator: jobtracker is 'local', generating exactly one partition.");
            numLists = 1;
        }
        job.setLong(GENERATOR_CUR_TIME, curTime);
        // record real generation time
        long generateTime = System.currentTimeMillis();
        job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
        job.setLong(GENERATOR_TOP_N, topN);
        job.setBoolean(GENERATOR_FILTER, filter);
        job.setBoolean(GENERATOR_NORMALISE, norm);
        job.setInt(GENERATOR_MAX_NUM_SEGMENTS, maxNumSegments);

        FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
        job.setInputFormat(SequenceFileInputFormat.class);

        job.setMapperClass(Selector.class);
        job.setPartitionerClass(Selector.class);
        job.setReducerClass(Selector.class);

        FileOutputFormat.setOutputPath(job, tempDir);
        job.setOutputFormat(SequenceFileOutputFormat.class);
        job.setOutputKeyClass(FloatWritable.class);
        job.setOutputKeyComparatorClass(DecreasingFloatComparator.class);
        job.setOutputValueClass(SelectorEntry.class);
        job.setOutputFormat(GeneratorOutputFormat.class);

        try {
            JobClient.runJob(job);
        } catch (IOException e) {
            throw e;
        }

        // read the subdirectories generated in the temp
        // output and turn them into segments
        List<Path> generatedSegments = new ArrayList<Path>();

        FileStatus[] status = fs.listStatus(tempDir);
        try {
            for (FileStatus stat : status) {
                Path subfetchlist = stat.getPath();
                if (!subfetchlist.getName().startsWith("fetchlist-")) {
                    continue;
                }
                // start a new partition job for this segment
                Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists);
                generatedSegments.add(newSeg);
            }
        } catch (Exception e) {
            LOG.warn("ReGenerator: exception while partitioning segments, exiting ...");
            fs.delete(tempDir, true);
            return null;
        }

        if (generatedSegments.size() == 0) {
            LOG.warn("ReGenerator: 0 records selected for fetching, exiting ...");
            LockUtil.removeLockFile(fs, lock);
            fs.delete(tempDir, true);
            return null;
        }

        if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
            // update the db from tempDir
            Path tempDir2 = new Path(
                    getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

            job = new NutchJob(getConf());
            job.setJobName("generate: updatedb " + dbDir);
            job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
            for (Path segmpaths : generatedSegments) {
                Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME);
                FileInputFormat.addInputPath(job, subGenDir);
            }
            FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
            job.setInputFormat(SequenceFileInputFormat.class);
            job.setMapperClass(CrawlDbUpdater.class);
            job.setReducerClass(CrawlDbUpdater.class);
            job.setOutputFormat(MapFileOutputFormat.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(CrawlDatum.class);
            FileOutputFormat.setOutputPath(job, tempDir2);
            try {
                JobClient.runJob(job);
                CrawlDb.install(job, dbDir);
            } catch (IOException e) {
                LockUtil.removeLockFile(fs, lock);
                fs.delete(tempDir, true);
                fs.delete(tempDir2, true);
                throw e;
            }
            fs.delete(tempDir2, true);
        }

        LockUtil.removeLockFile(fs, lock);
        fs.delete(tempDir, true);

        long end = System.currentTimeMillis();
        LOG.info("ReGenerator: finished at " + sdf.format(end) + ", elapsed: "
                + TimingUtil.elapsedTime(start, end));

        Path[] patharray = new Path[generatedSegments.size()];
        return generatedSegments.toArray(patharray);
    } catch (Exception ex) {
        LOG.error("ReGenerator generate error: " + ex.toString(), ex);
        return null;
    }
}