Example usage for org.apache.hadoop.mapred JobConf get

Introduction

This page shows example usages of org.apache.hadoop.mapred.JobConf.get(String, String), collected from the source files listed below.

Prototype

public String get(String name, String defaultValue) 

Document

Get the value of the name property. If no such property exists, the provided defaultValue is returned.
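
Below is a minimal, self-contained sketch of the two-argument get (not drawn from the usages listed further down); the property names example.buffer.size and example.compression.codec are hypothetical placeholders:

import org.apache.hadoop.mapred.JobConf;

public class JobConfGetExample {
    public static void main(String[] args) {
        JobConf conf = new JobConf();
        conf.set("example.buffer.size", "4096");

        // The property is set, so the configured value is returned.
        String bufferSize = conf.get("example.buffer.size", "1024");   // "4096"

        // No such property exists, so the default value is returned.
        String codec = conf.get("example.compression.codec", "none");  // "none"

        System.out.println(bufferSize + " " + codec);
    }
}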

Usage

From source file:com.bianfeng.bfas.hive.io.RealtimeInputFormat2.java

License:Apache License

/**
 * Get the list of input {@link Path}s for the map-reduce job.
 * @param conf The configuration of the job 
 * @return the list of input {@link Path}s for the map-reduce job.
 */
public static Path[] getInputPaths(JobConf conf) {
    String dirs = conf.get("mapred.input.dir", "");
    String[] list = StringUtils.split(dirs);
    Path[] result = new Path[list.length];
    for (int i = 0; i < list.length; i++) {
        result[i] = new Path(StringUtils.unEscapeString(list[i]));
    }
    return result;
}

From source file:com.blackberry.logdriver.mapred.BinaryRecordWriter.java

License:Apache License

public BinaryRecordWriter(JobConf job) {
    String extension = job.get("output.file.extension", "");

    String taskid = job.get("mapred.task.id");
    try {
        Path outputPath = BinaryOutputFormat.getTaskOutputPath(job, taskid + extension);

        FileSystem fs = FileSystem.get(job);
        LOG.info("Creating output path: {}", outputPath);
        out = fs.create(outputPath, true);
    } catch (IOException e) {
        LOG.error("Error creating output file.", e);
    }
}

From source file:com.cloudera.hive.scd.SQLUpdater.java

License:Open Source License

private List<String> loadUpdateStatements(InputSplit split, JobConf jc) throws IOException {
    long currentSCDTime = asSCDTime(jc.get("scd.time", ""), System.currentTimeMillis());
    List<String> stmts = Lists.newArrayList();
    if (split instanceof FileSplit) {
        Path base = ((FileSplit) split).getPath();
        FileSystem fs = base.getFileSystem(jc);
        Path updates = new Path(base.getParent(), ".updates");
        if (fs.exists(updates)) {
            stmts.addAll(readLines(fs, updates, currentSCDTime));
        }
    }
    return stmts;
}

From source file:com.davidgildeh.hadoop.input.simpledb.SimpleDBInputFormat.java

License:Apache License

/**
 * Main method to generate splits from SimpleDB. Takes a start and end date range
 * and generates split periods to filter SimpleDB based on the split period given in
 * the configuration.
 * 
 * @param jobConf       The Map Task Job Configuration
 * @param numSplits     Hint to calculate the number of splits
 * @return              The Input Splits
 * @throws IOException 
 */
public InputSplit[] getSplits(JobConf jobConf, int numSplits) throws IOException {

    // Get the splitsize (number of rows), defaults to 100,000 as much larger seems to 
    // screw up getting accurate rows
    int splitSize = Integer
            .parseInt(jobConf.get(SimpleDBInputFormat.SIMPLEDB_SPLIT_SIZE, String.valueOf(MAX_SPLIT_SIZE)));
    if (splitSize > MAX_SPLIT_SIZE) {
        splitSize = MAX_SPLIT_SIZE;
    }

    // Get total number of rows to calculate number of splits required
    SimpleDBDAO sdb = new SimpleDBDAO(jobConf);
    long totalItems;
    if (sdb.getWhereQuery() != null) {
        totalItems = sdb.getCount();
    } else {
        totalItems = sdb.getTotalItemCount();
    }

    long totalSplits = 1;
    if (splitSize < totalItems) {
        totalSplits = totalItems / splitSize;
    }

    if (LOG.isDebugEnabled()) {
        LOG.debug("Total Rows:" + String.valueOf(totalItems));
        LOG.debug("Total Splits:" + String.valueOf(totalSplits));
    }

    // Array to hold splits
    ArrayList<SimpleDBInputSplit> splits = new ArrayList<SimpleDBInputSplit>();

    // Create Splits
    for (int i = 0; i < totalSplits; i++) {

        String splitToken = sdb.getSplitToken(i, splitSize);
        long startRow = i * splitSize;
        long endRow = startRow + splitSize;
        if (endRow > totalItems) {
            endRow = totalItems;
        }

        SimpleDBInputSplit split = new SimpleDBInputSplit(startRow, endRow, splitToken);
        splits.add(split);

        if (LOG.isDebugEnabled()) {
            LOG.debug("Created Split: " + split.toString());
        }
    }

    // Return array of splits
    return splits.toArray(new SimpleDBInputSplit[splits.size()]);
}

From source file:com.digitalpebble.behemoth.ClassifierJob.java

License:Apache License

@Override
public void configure(JobConf job) {
    super.configure(job);
    filter = DocumentFilter.getFilters(job);
    lowerCase = job.getBoolean("classification.tokenize", false);
    docFeaturename = job.get("classification.doc.feature.name", "label");

    String modelPath = job.get(ClassifierJob.modelNameParam);

    // optimisation for jvm reuse
    // do not reload the model
    if (classifier != null) {
        LOG.info("Reusing existing classifier [" + classifier.toString() + "]");
        return;
    }

    long start = System.currentTimeMillis();
    File modelFile = null;
    try {
        String modelCacheName = new Path(modelPath).getName();
        Path[] cacheFiles = DistributedCache.getLocalCacheArchives(job);
        if (null != cacheFiles && cacheFiles.length > 0) {
            for (Path cachePath : cacheFiles) {
                LOG.info("LocalCache : " + cachePath.toUri());
                LOG.info("modelCacheName : " + modelCacheName);
                if (cachePath.toUri().toString().endsWith(modelCacheName)) {
                    String parent = new File(cachePath.toUri().getPath()).toString();
                    modelFile = new File(parent, modelCacheName.replaceAll(".zip", ""));
                    LOG.info("Unzipped ? " + modelFile.getAbsolutePath());
                    boolean doesExist = modelFile.exists();
                    LOG.info("modelFile exists " + doesExist);
                    // if it does not exist it must have been unpacked at
                    // the parent level
                    if (!doesExist) {
                        modelFile = new File(parent);
                    }
                    break;
                }
            }
        }
    } catch (IOException ioe) {
        throw new RuntimeException("Impossible to retrieve model from distributed cache", ioe);
    }

    try {
        classifier = classifier.getClassifier(modelFile);
    } catch (Exception e) {
        throw new RuntimeException("Impossible to load model from " + modelFile, e);
    }
    long end = System.currentTimeMillis();
    LOG.info("Model loaded in " + (end - start) + " msec");
}

From source file:com.ebay.erl.mobius.core.builder.TSVDatasetBuilder.java

License:Apache License

/**
 * {@inheritDoc}
 */
@Override
public Dataset buildFromPreviousJob(JobConf prevJob, Class<? extends FileOutputFormat> prevJobOutputFormat,
        String[] schema) throws IOException {
    if (prevJobOutputFormat.equals(TextOutputFormat.class)) {
        // no need to validate the input path as it's coming from 
        // previous dataset
        this.addInputPath(false, FileOutputFormat.getOutputPath(prevJob));
        this.setSchema(schema);
        this.setDelimiter(prevJob.get(ConfigureConstants.TUPLE_TO_STRING_DELIMITER, "\t"));
        return this.build();
    } else {
        throw new IllegalArgumentException(this.getClass().getCanonicalName() + " cannot build dataset from "
                + prevJobOutputFormat.getCanonicalName() + ", only " + TextOutputFormat.class.getCanonicalName()
                + " is supported.");
    }
}

From source file:com.ebay.erl.mobius.core.datajoin.EvenlyPartitioner.java

License:Apache License

/**
 * Get the path to the SequenceFile storing the sorted partition keyset.
 * @see #setPartitionFile(JobConf,Path)
 */
public static String getPartitionFile(JobConf job) {
    return job.get("total.order.partitioner.path", DEFAULT_PATH);
}

From source file:com.ebay.erl.mobius.core.mapred.ConfigurableJob.java

License:Apache License

@Override
protected synchronized void submit() {
    JobConf jobConf = this.getJobConf();
    boolean isLocalHadoop = jobConf.get("mapred.job.tracker", "local").equals("local");

    // The default partitioner is {@link com.ebay.erl.mobius.core.datajoin.DataJoinKeyPartitioner},
    // which is hash based.
    //
    // If the user chooses the even partitioner, Mobius will use
    // {@link com.ebay.erl.mobius.core.datajoin.EvenlyPartitioner}, a
    // sampling-based partitioner that attempts to balance the load
    // across the reducers.
    String partitioner = jobConf.get("mobius.partitioner", "default");

    if (!isLocalHadoop && jobConf.getNumReduceTasks() != 0 && partitioner.equals("even")) {
        // this job needs reducers; perform sampling on the keys so that
        // the load on the reducers is almost evenly distributed.

        double freq = jobConf.getFloat("mobius.sampler.freq", 0.1F);
        int numSamples = jobConf.getInt("mobius.sampler.num.samples", 50000);
        int maxSplits = jobConf.getInt("mobius.sampler.max.slipts.sampled", 5);

        // log sampling parameters so that user knows.
        LOGGER.info("Sampling parameters { " + "mobius.sampler.freq:" + format.format(freq) + ", "
                + "mobius.sampler.num.samples:" + numSamples + ", " + "mobius.sampler.max.slipts.sampled:"
                + maxSplits + "}");

        InputSampler.Sampler<?, ?> sampler = new MobiusInputSampler(freq, numSamples, maxSplits);

        writePartitionFile(jobConf, sampler);

        // add to distributed cache
        try {
            URI partitionUri = new URI(TotalOrderPartitioner.getPartitionFile(jobConf) + "#_partitions");
            LOGGER.info("Adding partition uri to distributed cache:" + partitionUri.toString());

            DistributedCache.addCacheFile(partitionUri, jobConf);
            DistributedCache.createSymlink(jobConf);
            jobConf.setPartitionerClass(EvenlyPartitioner.class);

            LOGGER.info("Using " + EvenlyPartitioner.class.getCanonicalName()
                    + " to partition the keys evenly among reducers.");
        } catch (URISyntaxException e) {
            LOGGER.error(e.getMessage(), e);
            throw new RuntimeException(e);
        }

        // adding -XX:-UseParallelOldGC, this will automatically set -XX:-UseParallelGC
        // according to Oracle's specification
        String jvmOpts = jobConf.get("mapred.child.java.opts", "");
        if (jvmOpts.isEmpty()) {
            jvmOpts = "-XX:-UseParallelOldGC";
        } else {
            if (jvmOpts.indexOf("-XX:-UseParallelOldGC") < 0) {
                // remove double-quote characters from the existing opts
                jvmOpts = jvmOpts.replaceAll("\"", "");
                jvmOpts = jvmOpts.concat(" -XX:-UseParallelOldGC");
            }
        }
        jobConf.set("mapred.child.java.opts", jvmOpts);

        this.setJobConf(jobConf);
    }
    LOGGER.info("Submitting job:" + jobConf.getJobName());
    super.submit();
}

From source file:com.ebay.erl.mobius.core.mapred.ConfigurableJob.java

License:Apache License

private static void writePartitionFile(JobConf job, Sampler sampler) {
    try {
        ////////////////////////////////////////////////
        // first, getting samples from the data sources
        ////////////////////////////////////////////////
        LOGGER.info("Running local sampling for job [" + job.getJobName() + "]");
        InputFormat inf = job.getInputFormat();
        Object[] samples = sampler.getSample(inf, job);
        LOGGER.info("Samples retrieved, sorting...");

        ////////////////////////////////////////////////
        // sort the samples
        ////////////////////////////////////////////////
        RawComparator comparator = job.getOutputKeyComparator();
        Arrays.sort(samples, comparator);

        if (job.getBoolean("mobius.print.sample", false)) {
            PrintWriter pw = new PrintWriter(
                    new OutputStreamWriter(new GZIPOutputStream(new BufferedOutputStream(new FileOutputStream(
                            new File(job.get("mobius.sample.file", "./samples.txt.gz")))))));
            for (Object obj : samples) {
                pw.println(obj);
            }
            pw.flush();
            pw.close();
        }

        ////////////////////////////////////////////////
        // start to write partition files
        ////////////////////////////////////////////////

        FileSystem fs = FileSystem.get(job);
        Path partitionFile = fs.makeQualified(new Path(TotalOrderPartitioner.getPartitionFile(job)));
        while (fs.exists(partitionFile)) {
            partitionFile = new Path(partitionFile.toString() + "." + System.currentTimeMillis());
        }
        fs.deleteOnExit(partitionFile);
        TotalOrderPartitioner.setPartitionFile(job, partitionFile);
        LOGGER.info("write partition file to:" + partitionFile.toString());

        int reducersNbr = job.getNumReduceTasks();
        Set<Object> wroteSamples = new HashSet<Object>();

        SequenceFile.Writer writer = SequenceFile.createWriter(fs, job, partitionFile, Tuple.class,
                NullWritable.class);

        float avgReduceSize = samples.length / reducersNbr;

        int lastBegin = 0;
        for (int i = 0; i < samples.length;) {
            // trying to distribute the load evenly across reducers by
            // dividing the <code>samples</code> into a set of blocks
            // separated by boundaries (objects selected from the
            // <code>samples</code> array); each block should have
            // about the same size.

            // find the last index of the element equal to samples[i], as
            // such an element might appear multiple times in the samples.
            int upperBound = Util.findUpperBound(samples, samples[i], comparator);

            int lowerBound = i;//Util.findLowerBound(samples, samples[i], comparator);

            // the number of times samples[i] repeats; if this key alone is
            // too big, select it as a boundary
            int currentElemSize = upperBound - lowerBound + 1;

            if (currentElemSize > avgReduceSize * 2) // greater than twice the average reducer size
            {
                // the current element is too big, greater than
                // twice the <code>avgReduceSize</code>,
                // so put it as a boundary
                writer.append(((DataJoinKey) samples[i]).getKey(), NullWritable.get());
                wroteSamples.add(((DataJoinKey) samples[i]).getKey());
                //pw.println(samples[i]);

                // immediately put the next element on the boundary; the
                // next element starts at <code>upperBound+1</code>, to
                // prevent the current one from consuming even more.
                if (upperBound + 1 < samples.length) {
                    writer.append(((DataJoinKey) samples[upperBound + 1]).getKey(), NullWritable.get());
                    wroteSamples.add(((DataJoinKey) samples[upperBound + 1]).getKey());
                    //pw.println(samples[upperBound+1]);

                    // move on to the next element after <code>samples[upperBound+1]</code>
                    lastBegin = Util.findUpperBound(samples, samples[upperBound + 1], comparator) + 1;
                    i = lastBegin;
                } else {
                    break;
                }
            } else {
                // the current element is small enough to be considered
                // together with the previous group
                int size = upperBound - lastBegin;
                if (size > avgReduceSize) {
                    // by including the current elements, we have
                    // found a block that's big enough, select it
                    // as boundary
                    writer.append(((DataJoinKey) samples[i]).getKey(), NullWritable.get());
                    wroteSamples.add(((DataJoinKey) samples[i]).getKey());
                    //pw.println(samples[i]);

                    i = upperBound + 1;
                    lastBegin = i;
                } else {
                    i = upperBound + 1;
                }
            }
        }

        writer.close();

        // if the number of written samples doesn't equal the number of
        // reducers minus one, the key space is too small and
        // TotalOrderPartitioner won't work; it works only if
        // the partition boundaries are distinct.
        //
        // we need to change the number of reducers
        if (wroteSamples.size() + 1 != reducersNbr) {
            LOGGER.info("Write complete, but key space is too small, sample size=" + wroteSamples.size()
                    + ", reducer size:" + (reducersNbr));
            LOGGER.info("Set the reducer size to:" + (wroteSamples.size() + 1));

            // add 1 because the written samples define boundaries; e.g., if
            // the sample size is two with the two elements [300, 1000], then
            // there should be 3 reducers: one handling i<300, one
            // for 300<=i<1000, and another one for 1000<=i
            job.setNumReduceTasks((wroteSamples.size() + 1));
        }

        samples = null;
    } catch (IOException e) {
        LOGGER.error(e.getMessage(), e);
        throw new RuntimeException(e);
    }
}

From source file:com.ebay.erl.mobius.core.mapred.MobiusInputSampler.java

License:Apache License

@Override
public Object[] getSample(InputFormat inf, JobConf job) throws IOException {
    // the following code is copied from {@link InputSampler#RandomSampler},
    // but requires some modifications.

    InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
    ArrayList<DataJoinKey> samples = new ArrayList<DataJoinKey>(this.numSamples);
    int splitsToSample = Math.min(this.maxSplitsSampled, splits.length);

    Random r = new Random();
    long seed = r.nextLong();
    r.setSeed(seed);

    // get Sorters
    Sorter[] sorters = null;
    if (job.get(ConfigureConstants.SORTERS, null) != null) {
        // total sort job
        sorters = (Sorter[]) SerializableUtil.deserializeFromBase64(job.get(ConfigureConstants.SORTERS), job);
    } else {
        // there is no sorter, should be reducer/join job
        Column[] keys = (Column[]) SerializableUtil
                .deserializeFromBase64(job.get(ConfigureConstants.ALL_GROUP_KEY_COLUMNS), job);
        sorters = new Sorter[keys.length];
        for (int i = 0; i < keys.length; i++) {
            sorters[i] = new Sorter(keys[i].getInputColumnName(), Ordering.ASC);
        }
    }

    long proportion = 10L;
    while ((int) (this.freq * proportion) == 0) {
        proportion = proportion * 10;
    }
    proportion = 5L * proportion;

    // shuffle splits
    for (int i = 0; i < splits.length; ++i) {
        InputSplit tmp = splits[i];
        int j = r.nextInt(splits.length);
        splits[i] = splits[j];
        splits[j] = tmp;
    }

    SamplingOutputCollector collector = new SamplingOutputCollector();
    for (int i = 0; i < splitsToSample || (i < splits.length && samples.size() < numSamples); i++) {
        LOGGER.info("Sampling from split #" + (i + 1) + ", collected samples:" + samples.size());

        RecordReader<WritableComparable, WritableComparable> reader = inf.getRecordReader(splits[i], job,
                Reporter.NULL);
        WritableComparable key = reader.createKey();
        WritableComparable value = reader.createValue();

        if (!(inf instanceof MobiusDelegatingInputFormat)) {
            // not the Mobius delegating input format, so CURRENT_DATASET_ID
            // will not be set by inf#getRecordReader; we set it here.
            //
            // set the current dataset id, as the AbstractMobiusMapper#configure
            // method needs this property.
            job.set(ConfigureConstants.CURRENT_DATASET_ID, job.get(ConfigureConstants.ALL_DATASET_IDS));
        }

        Byte datasetID = Byte.valueOf(job.get(ConfigureConstants.CURRENT_DATASET_ID));
        LOGGER.info("Samples coming from dataset: " + datasetID.toString());
        AbstractMobiusMapper mapper = this.getMapper(inf, splits[i], job);
        mapper.configure(job);

        // reading elements from one split
        long readElement = 0;
        while (reader.next(key, value)) {
            collector.clear();
            Tuple tuple = mapper.parse(key, value);

            readElement++;
            if (readElement > (((long) numSamples) * ((long) proportion))) {
                // a split might be very big (ex: a large gz file),
                // so read at most numSamples * proportion records from it
                break;
            }

            if (r.nextDouble() <= freq) {
                if (samples.size() < numSamples) {
                    mapper.joinmap(key, value, collector, Reporter.NULL);
                    // joinmap function might generate more than one output key
                    // per <code>key</code> input. 
                    for (Tuple t : collector.getOutKey()) {
                        Tuple mt = Tuple.merge(tuple, t);
                        DataJoinKey nkey = this.getKey(mt, sorters, datasetID, mapper, job);
                        samples.add(nkey);
                    }
                } else {
                    // When exceeding the maximum number of samples, replace
                    // a random element with this one, then adjust the
                    // frequency to reflect the possibility of existing 
                    // elements being pushed out

                    mapper.joinmap(key, value, collector, Reporter.NULL);
                    for (Tuple t : collector.getOutKey()) {
                        int ind = r.nextInt(numSamples);
                        if (ind != numSamples) {
                            Tuple mt = Tuple.merge(tuple, t);
                            DataJoinKey nkey = this.getKey(mt, sorters, datasetID, mapper, job);
                            samples.set(ind, nkey);
                        }
                    }

                    freq *= (numSamples - collector.getOutKey().size()) / (double) numSamples;
                }
                key = reader.createKey();
                value = reader.createValue();
            }
        }
        reader.close();
    }
    LOGGER.info("Samples have been collected, return.");
    return samples.toArray();
}