Example usage for org.apache.hadoop.mapred JobConf get

List of usage examples for org.apache.hadoop.mapred JobConf get

Introduction

On this page you can find example usage for org.apache.hadoop.mapred JobConf get.

Prototype

public String get(String name) 

Document

Get the value of the name property, or null if no such property exists.
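
A minimal sketch of the call itself; the property name and values below are illustrative and not taken from the examples that follow.

JobConf conf = new JobConf();
conf.set("example.table.name", "events");        // illustrative property

String table = conf.get("example.table.name");   // returns "events"
String missing = conf.get("example.not.set");    // returns null: property was never set
if (missing == null) {
    // callers usually fall back to a default value or fail fast here
}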

Usage

From source file:com.ask.hive.hbase.HiveHBaseTextTableInputFormat.java

License:Apache License

public InputSplit[] getSplits(JobConf jobConf, int numSplits) throws IOException {

    String hbaseTableName = jobConf.get(HBaseSerDe.HBASE_TABLE_NAME);
    setHTable(new HTable(new HBaseConfiguration(jobConf), Bytes.toBytes(hbaseTableName)));
    String hbaseColumnsMapping = jobConf.get(HBaseSerDe.HBASE_COLUMNS_MAPPING);

    if (hbaseColumnsMapping == null) {
        throw new IOException("hbase.columns.mapping required for HBase Table.");
    }

    List<String> hbaseColumnFamilies = new ArrayList<String>();
    List<String> hbaseColumnQualifiers = new ArrayList<String>();
    List<byte[]> hbaseColumnFamiliesBytes = new ArrayList<byte[]>();
    List<byte[]> hbaseColumnQualifiersBytes = new ArrayList<byte[]>();

    int iKey;
    try {
        iKey = HBaseSerDe.parseColumnMapping(hbaseColumnsMapping, hbaseColumnFamilies, hbaseColumnFamiliesBytes,
                hbaseColumnQualifiers, hbaseColumnQualifiersBytes);
    } catch (SerDeException se) {
        throw new IOException(se);
    }

    Scan scan = new Scan();

    // Take filter pushdown into account while calculating splits; this
    // allows us to prune off regions immediately.  Note that although
    // the Javadoc for the superclass getSplits says that it returns one
    // split per region, the implementation actually takes the scan
    // definition into account and excludes regions which don't satisfy
    // the start/stop row conditions (HBASE-1829).
    convertFilter(jobConf, scan, null, iKey);

    // REVIEW:  are we supposed to be applying the getReadColumnIDs
    // same as in getRecordReader?
    for (int i = 0; i < hbaseColumnFamilies.size(); i++) {
        if (i == iKey) {
            continue;
        }

        if (hbaseColumnQualifiers.get(i) == null) {
            scan.addFamily(hbaseColumnFamiliesBytes.get(i));
        } else {
            scan.addColumn(hbaseColumnFamiliesBytes.get(i), hbaseColumnQualifiersBytes.get(i));
        }
    }

    setScan(scan);
    Job job = new Job(jobConf);
    JobContext jobContext = new JobContext(job.getConfiguration(), job.getJobID());
    Path[] tablePaths = FileInputFormat.getInputPaths(jobContext);

    List<org.apache.hadoop.mapreduce.InputSplit> splits = super.getSplits(jobContext);
    InputSplit[] results = new InputSplit[splits.size()];

    for (int i = 0; i < splits.size(); i++) {
        results[i] = new HBaseSplit((TableSplit) splits.get(i), tablePaths[0]);
    }

    return results;
}

From source file:com.ask.hive.hbase.HiveHBaseTimeTableInputFormat.java

License:Apache License

public RecordReader<ImmutableBytesWritable, Result> getRecordReader(InputSplit split, JobConf jobConf,
        final Reporter reporter) throws IOException {

    HBaseSplit hbaseSplit = (HBaseSplit) split;
    TableSplit tableSplit = hbaseSplit.getSplit();
    String hbaseTableName = jobConf.get(HBaseSerDe.HBASE_TABLE_NAME);
    setHTable(new HTable(new HBaseConfiguration(jobConf), Bytes.toBytes(hbaseTableName)));
    String hbaseColumnsMapping = jobConf.get(HBaseSerDe.HBASE_COLUMNS_MAPPING);
    List<String> hbaseColumnFamilies = new ArrayList<String>();
    List<String> hbaseColumnQualifiers = new ArrayList<String>();
    List<byte[]> hbaseColumnFamiliesBytes = new ArrayList<byte[]>();
    List<byte[]> hbaseColumnQualifiersBytes = new ArrayList<byte[]>();

    int iKey;
    try {
        iKey = HBaseSerDe.parseColumnMapping(hbaseColumnsMapping, hbaseColumnFamilies, hbaseColumnFamiliesBytes,
                hbaseColumnQualifiers, hbaseColumnQualifiersBytes);
    } catch (SerDeException se) {
        throw new IOException(se);
    }
    List<Integer> readColIDs = ColumnProjectionUtils.getReadColumnIDs(jobConf);

    if (hbaseColumnFamilies.size() < readColIDs.size()) {
        throw new IOException("Cannot read more columns than the given table contains.");
    }

    boolean addAll = (readColIDs.size() == 0);
    Scan scan = new Scan();
    boolean empty = true;

    if (!addAll) {
        for (int i : readColIDs) {
            if (i == iKey) {
                continue;
            }

            if (hbaseColumnQualifiers.get(i) == null) {
                scan.addFamily(hbaseColumnFamiliesBytes.get(i));
            } else {
                scan.addColumn(hbaseColumnFamiliesBytes.get(i), hbaseColumnQualifiersBytes.get(i));
            }

            empty = false;
        }
    }

    // The HBase table's row key maps to a Hive table column. In the corner case when only the
    // row key column is selected in Hive, the HBase Scan will be empty i.e. no column family/
    // column qualifier will have been added to the scan. We arbitrarily add at least one column
    // to the HBase scan so that we can retrieve all of the row keys and return them as the Hive
    // table's column projection.
    if (empty) {
        for (int i = 0; i < hbaseColumnFamilies.size(); i++) {
            if (i == iKey) {
                continue;
            }

            if (hbaseColumnQualifiers.get(i) == null) {
                scan.addFamily(hbaseColumnFamiliesBytes.get(i));
            } else {
                scan.addColumn(hbaseColumnFamiliesBytes.get(i), hbaseColumnQualifiersBytes.get(i));
            }

            if (!addAll) {
                break;
            }
        }
    }

    //setting start and end time for scanning
    setTime(jobConf, scan);
    // If Hive's optimizer gave us a filter to process, convert it to the
    // HBase scan form now.
    tableSplit = convertFilter(jobConf, scan, tableSplit, iKey);

    setScan(scan);

    Job job = new Job(jobConf);
    TaskAttemptContext tac = new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID()) {

        @Override
        public void progress() {
            reporter.progress();
        }
    };

    final org.apache.hadoop.mapreduce.RecordReader<ImmutableBytesWritable, Result> recordReader = createRecordReader(
            tableSplit, tac);

    return new RecordReader<ImmutableBytesWritable, Result>() {

        //@Override
        public void close() throws IOException {
            recordReader.close();
        }

        // @Override
        public ImmutableBytesWritable createKey() {
            return new ImmutableBytesWritable();
        }

        // @Override
        public Result createValue() {
            return new Result();
        }

        // @Override
        public long getPos() throws IOException {
            return 0;
        }

        // @Override
        public float getProgress() throws IOException {
            float progress = 0.0F;

            try {
                progress = recordReader.getProgress();
            } catch (InterruptedException e) {
                throw new IOException(e);
            }

            return progress;
        }

        // @Override
        public boolean next(ImmutableBytesWritable rowKey, Result value) throws IOException {

            boolean next = false;

            try {
                next = recordReader.nextKeyValue();

                if (next) {
                    rowKey.set(recordReader.getCurrentValue().getRow());
                    Writables.copyWritable(recordReader.getCurrentValue(), value);
                }
            } catch (InterruptedException e) {
                throw new IOException(e);
            }

            return next;
        }
    };
}

From source file:com.ask.hive.hbase.HiveHBaseTimeTableInputFormat.java

License:Apache License

/**
 * The minimum time should be less than the maximum time;<br/>
 * otherwise the time-range filter is skipped.
 * @param jobConf
 * @param scan
 * @throws IOException
 */
private void setTime(JobConf jobConf, Scan scan) throws IOException {
    long min = 0L;
    String mintime = jobConf.get("hbase.mintime");
    if (StringUtils.isNotEmpty(mintime)) {
        min = Long.parseLong(mintime);
    }
    String maxtime = jobConf.get("hbase.maxtime");
    if (StringUtils.isNotEmpty(maxtime)) {
        long l = Long.parseLong(maxtime);
        if (min <= l)
            scan.setTimeRange(min, l);
    } else if (min > 0) {
        long l = System.currentTimeMillis();
        if (min <= l)
            scan.setTimeRange(min, l);
    }
}
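
A short sketch of the configuration this method reads; the epoch-millisecond values are placeholders.

JobConf jobConf = new JobConf();
// Both bounds are epoch milliseconds, read via jobConf.get(...) in setTime().
jobConf.set("hbase.mintime", "1609459200000");   // lower bound of the time range
jobConf.set("hbase.maxtime", "1612137600000");   // upper bound of the time range
// setTime(jobConf, scan) then calls scan.setTimeRange(min, max) when min <= max;
// if only hbase.mintime is set, the current time is used as the upper bound.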

From source file:com.ask.hive.hbase.HiveHBaseTimeTableInputFormat.java

License:Apache License

/**
 * Converts a filter (which has been pushed down from Hive's optimizer)
 * into corresponding restrictions on the HBase scan.  The
 * filter should already be in a form which can be fully converted.
 *
 * @param jobConf    configuration for the scan
 * @param scan       the HBase scan object to restrict
 * @param tableSplit the HBase table split to restrict, or null
 *                   if calculating splits
 * @param iKey       0-based offset of key column within Hive table
 * @return converted table split if any
 */
private TableSplit convertFilter(JobConf jobConf, Scan scan, TableSplit tableSplit, int iKey)
        throws IOException {

    String filterExprSerialized = jobConf.get(TableScanDesc.FILTER_EXPR_CONF_STR);
    if (filterExprSerialized == null) {
        return tableSplit;
    }
    ExprNodeDesc filterExpr = Utilities.deserializeExpression(filterExprSerialized, jobConf);

    String columnNameProperty = jobConf.get(Constants.LIST_COLUMNS);
    List<String> columnNames = Arrays.asList(columnNameProperty.split(","));

    IndexPredicateAnalyzer analyzer = newIndexPredicateAnalyzer(columnNames.get(iKey));

    List<IndexSearchCondition> searchConditions = new ArrayList<IndexSearchCondition>();
    ExprNodeDesc residualPredicate = analyzer.analyzePredicate(filterExpr, searchConditions);

    // There should be no residual since we already negotiated
    // that earlier in HBaseStorageHandler.decomposePredicate.
    if (residualPredicate != null) {
        throw new RuntimeException("Unexpected residual predicate " + residualPredicate.getExprString());
    }

    // There should be exactly one predicate since we already
    // negotiated that also.
    if (searchConditions.size() != 1) {
        throw new RuntimeException("Exactly one search condition expected in push down");
    }

    // Convert the search condition into a restriction on the HBase scan
    IndexSearchCondition sc = searchConditions.get(0);
    ExprNodeConstantEvaluator eval = new ExprNodeConstantEvaluator(sc.getConstantDesc());
    byte[] startRow;
    try {
        ObjectInspector objInspector = eval.initialize(null);
        Object writable = eval.evaluate(null);
        ByteStream.Output serializeStream = new ByteStream.Output();
        LazyUtils.writePrimitiveUTF8(serializeStream, writable, (PrimitiveObjectInspector) objInspector, false,
                (byte) 0, null);
        startRow = new byte[serializeStream.getCount()];
        System.arraycopy(serializeStream.getData(), 0, startRow, 0, serializeStream.getCount());
    } catch (HiveException ex) {
        throw new IOException(ex);
    }

    // stopRow is exclusive, so pad it with a trailing 0 byte to
    // make it compare as the very next value after startRow
    byte[] stopRow = new byte[startRow.length + 1];
    System.arraycopy(startRow, 0, stopRow, 0, startRow.length);

    if (tableSplit != null) {
        tableSplit = new TableSplit(tableSplit.getTableName(), startRow, stopRow,
                tableSplit.getRegionLocation());
    }
    scan.setStartRow(startRow);
    scan.setStopRow(stopRow);
    // Add a WhileMatchFilter to make the scan terminate as soon
    // as we see a non-matching key.  This is probably redundant
    // since the stopRow above should already take care of it for us.

    scan.setFilter(
            new WhileMatchFilter(new RowFilter(CompareFilter.CompareOp.EQUAL, new BinaryComparator(startRow))));
    return tableSplit;
}
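
The stop-row handling above relies on unsigned lexicographic ordering of HBase row keys; a small sketch of the same padding step, with an assumed key value.

byte[] startRow = Bytes.toBytes("row-42");               // assumed key value
byte[] stopRow = new byte[startRow.length + 1];          // "row-42" followed by 0x00
System.arraycopy(startRow, 0, stopRow, 0, startRow.length);
// stopRow is the smallest key that sorts after startRow, so using it as the
// exclusive stop row restricts the scan to exactly the single key startRow.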

From source file:com.ask.hive.hbase.PentahoTableCustomInputFormat.java

License:Apache License

public void configure(JobConf job) {

    String tableName = job.get(INPUT_TABLE);

    // columns can be colFam:colName or colFam:
    // the latter can be used to set up a scan that returns an entire column family
    String colArg = job.get(COLUMN_LIST);

    if (StringUtils.isNotEmpty(colArg)) {
        String[] colNames = colArg.split(" ");
        byte[][] m_cols = new byte[colNames.length][];
        for (int i = 0; i < m_cols.length; i++) {
            String colN = colNames[i];
            m_cols[i] = Bytes.toBytes(colN);
        }
        setInputColumns(m_cols);
    }

    try {
        setHTable(new HTable(HBaseConfiguration.create(job), tableName));
    } catch (Exception e) {
        //  PLOG.error(StringUtils.stringifyException(e));
    }

    // set our table record reader
    PentahoTableRecordReader rr = new PentahoTableRecordReader();
    String cacheSize = job.get(SCAN_CACHEDROWS);
    if (StringUtils.isNotEmpty(cacheSize)) {
        rr.setScanCacheRowSize(Integer.parseInt(cacheSize));
    }
    FilterList list = new FilterList(FilterList.Operator.MUST_PASS_ALL);
    String hvalue = job.get(SCAN_INCLUDE_FILTER);
    if (StringUtils.isNotEmpty(hvalue)) {
        String[] columns = hvalue.split(",");
        if (columns.length > 0) {
            for (String column : columns) {
                String[] fv = column.split(":");
                if (fv.length > 2) {
                    SingleColumnValueFilter rowfilter = new SingleColumnValueFilter(Bytes.toBytes(fv[0]),
                            Bytes.toBytes(fv[1]), CompareOp.EQUAL, Bytes.toBytes(fv[2]));
                    list.addFilter(rowfilter);
                }
            }
        }
    }
    String hexvalue = job.get(SCAN_EXCLUDE_FILTER);
    if (StringUtils.isNotEmpty(hexvalue)) {
        String[] columns = hexvalue.split(",");
        if (columns.length > 0) {
            for (String column : columns) {
                String[] fv = column.split(":");
                if (fv.length > 2) {
                    SingleColumnValueExcludeFilter rowfilter = new SingleColumnValueExcludeFilter(
                            Bytes.toBytes(fv[0]), Bytes.toBytes(fv[1]), CompareOp.NOT_EQUAL,
                            Bytes.toBytes(fv[2]));
                    list.addFilter(rowfilter);
                }
            }
        }
    }
    String hmax = job.get("hbase.max.version");
    if (StringUtils.isNotEmpty(hmax)) {
        rr.setMaxVersion(Integer.parseInt(hmax));
    }
    rr.setValueFilter(list);
    setTableRecordReader(rr);
}

From source file:com.ask.hive.hbase.PentahoTableCustomInputFormat.java

License:Apache License

public void validateInput(JobConf job) throws IOException {
    // expecting a table name
    String tableName = job.get(INPUT_TABLE);
    if (StringUtils.isEmpty(tableName)) {
        throw new IOException("expecting one table name");
    }

    // connected to table?                                                                                                                             
    if (getHTable() == null) {
        throw new IOException("could not connect to table '" + tableName + "'");
    }

    // expecting at least one column/column family                                                                                                                   
    String colArg = job.get(COLUMN_LIST);
    if (colArg == null || colArg.length() == 0) {
        throw new IOException("expecting at least one column/column family");
    }
}

From source file:com.bah.culvert.hive.CulvertInputFormat.java

License:Apache License

@Override
public InputSplit[] getSplits(JobConf conf, int numSplits) throws IOException {
    String expression = conf.get(TableScanDesc.FILTER_EXPR_CONF_STR);
    if (expression == null) {
        // return what??
    }
    ExprNodeDesc filterExpr = Utilities.deserializeExpression(expression, conf);
    CulvertPredicateHandlerDelegate predicateHandler = new CulvertPredicateHandlerDelegate();
    CulvertIndexMapping[] culvertIndexMappings = CulvertHiveUtils.getIndexMappings(conf);
    /*
     * there should be no residual predicate, just the pushed predicate, since
     * we already handled that in the storage handler.
     */
    List<IndexSearchCondition> indexSearchConditions = predicateHandler.decomposePredicateToList(filterExpr,
            culvertIndexMappings);
    List<Constraint> compiledConstraints = new ArrayList<Constraint>();
    Map<String, CRange> rangeMap = new HashMap<String, CRange>();
    // get start and end ranges from the conditions
    for (IndexSearchCondition condition : indexSearchConditions) {
        /*
         * The hive column is a simple string -- easy This column name will let us
         * get an index from the culvert configuration specified in the conf.
         */
        String hiveColumn = condition.getColumnDesc().getColumn();
        /*
         * the operation maps to the op name specified in
         * CulvertPredicateHandlerDelegate.createPredicateAnalyzerForCulvert()
         */
        String operation = condition.getComparisonOp();
        /*
         * We need to manipuate this quite a bit- we need to turn it into an
         * object and an object inspector, and then use the object inspector to
         * find a byte value for the key.
         */
        ExprNodeConstantEvaluator nodeEvaluator = new ExprNodeConstantEvaluator(condition.getConstantDesc());
        PrimitiveObjectInspector inspector;
        Object value;
        // we can assume that the inspector is a primitive
        try {
            inspector = (PrimitiveObjectInspector) nodeEvaluator.initialize(null);
            value = nodeEvaluator.evaluate(null);
        } catch (HiveException e) {
            throw new RuntimeException(
                    "Unable to get constant portion of query expression expression " + condition, e);
        }
        Object primitive = inspector.getPrimitiveJavaObject(value);
        // look for the storage type in the column mapping
        CulvertIndexMapping culvertIndexMapping = getMapping(hiveColumn, culvertIndexMappings);

        if (culvertIndexMapping == null) {
            throw new IllegalArgumentException("Could not find a culvert mapping for this hive column "
                    + hiveColumn + " out of " + Arrays.toString(culvertIndexMappings));
        }
        // we have a utility to get the byte representation based on the hive
        // column type and the actual primitive itself.
        byte[] culvertValue = CulvertHiveUtils.getBytesForPrimitiveObject(primitive,
                culvertIndexMapping.getType());
        CRange range = rangeMap.get(hiveColumn);
        if (range == null) {
            // ok, start a new range
            // we know we'll always want to insert for this operation
            if (operation.equals(GenericUDFOPEqual.class.getName())) {
                // =
                range = new CRange(culvertValue);
            } else if (operation.equals(GenericUDFOPGreaterThan.class.getName())) {
                // >
                range = new CRange(Bytes.lexIncrement(culvertValue), Bytes.START_END_KEY);
            } else if (operation.equals(GenericUDFOPEqualOrGreaterThan.class.getName())) {
                // =>
                range = new CRange(culvertValue, Bytes.START_END_KEY);
            } else if (operation.equals(GenericUDFOPEqualOrLessThan.class.getName())) {
                // =<
                range = new CRange(Bytes.START_END_KEY, culvertValue);
            }
            /*
             * TODO we can't implement < because lexDecrement is undefined (in math
             * speak). Need to work around that by pushing down inclusive/exclusive
             * on the CRange.
             */
            rangeMap.put(hiveColumn, range);
        } else {
            LexicographicBytesComparator c = LexicographicBytesComparator.INSTANCE;
            if (operation.equals(GenericUDFOPEqual.class.getName())) {
                // = (if two conflicting equals, the whole predicate becomes a no-op)
                if (c.compare(range.getStart(), culvertValue) != 0) {
                    rangeMap.clear();
                    return new InputSplit[0];
                }
            } else if (operation.equals(GenericUDFOPGreaterThan.class.getName())) {
                // >
                // narrow the existing range
                byte[] exclusiveValue = Bytes.lexIncrement(culvertValue);
                int comp = c.compare(exclusiveValue, range.getEnd());
                if (comp < 1) {
                    rangeMap.put(hiveColumn, new CRange(Bytes.max(exclusiveValue, range.getStart())));
                } else if (comp == 0) {
                    // no-op - this is the same as what we're doing already
                } else {
                    // if we get conflicting statements, kill it all...
                    rangeMap.clear();
                    return new InputSplit[0];
                }
            } else if (operation.equals(GenericUDFOPEqualOrGreaterThan.class.getName())) {
                // =>
                // narrow the existing range
                int comp = c.compare(culvertValue, range.getEnd());
                if (comp < 1) {
                    rangeMap.put(hiveColumn, new CRange(Bytes.max(culvertValue, range.getStart())));
                } else if (comp == 0) {
                    // no-op - this is the same as what we're doing already
                } else {
                    // if we get conflicting statements, kill it all...
                    rangeMap.clear();
                    return new InputSplit[0];
                }
            } else if (operation.equals(GenericUDFOPEqualOrLessThan.class.getName())) {
                // =<
                int comp = c.compare(culvertValue, range.getStart());
                if (comp > 1) {
                    rangeMap.put(hiveColumn, new CRange(Bytes.min(culvertValue, range.getEnd())));
                } else if (comp == 0) {
                    // no-op
                } else {
                    // if we get conflicting statements, kill it...
                    rangeMap.clear();
                    return new InputSplit[0];
                }
            }
        }
    }
    // <-- end getting start/end ranges
    Client culvertClient = new Client(conf);

    Index[] culvertIndices = culvertClient.getIndicesForTable(CulvertHiveUtils.getCulvertTable(conf));
    // create a map with the broken up ranges
    SortedMap<String, List<Constraint>> brokenRanges = new TreeMap<String, List<Constraint>>();
    for (Entry<String, CRange> rangeEntry : rangeMap.entrySet()) {
        String hiveColumn = rangeEntry.getKey();
        CRange totalRange = rangeEntry.getValue();
        Index indexToUse = getIndexForHiveColumn(hiveColumn, culvertIndexMappings, culvertIndices);
        /*
         * There should be an index from when we did the predicate pushdown in the
         * storage handler
         */
        if (indexToUse == null) {
            throw new RuntimeException("No indices exist over the requested column");
        }
        /*
         * Set the start and end on the splits to start where the overall range
         * starts and ends so we don't query the entire shard, which would give
         * the wrong result.
         */
        List<CRange> startEnds = indexToUse.getSplits();
        List<Constraint> newRanges = new ArrayList<Constraint>();
        if (startEnds.size() == 1) {
            newRanges.add(new IndexRangeConstraint(indexToUse, totalRange));
            brokenRanges.put(hiveColumn, newRanges);
            continue;
        }
        assert (startEnds.size() != 0);
        CRange firstShard = startEnds.get(0);
        CRange first = new CRange(totalRange.getStart(), firstShard.getEnd());
        newRanges.add(new IndexRangeConstraint(indexToUse, first));
        for (int i = 1; i < startEnds.size() - 1; i++) {
            newRanges.add(new IndexRangeConstraint(indexToUse, startEnds.get(i)));
        }
        CRange lastShard = startEnds.get(startEnds.size() - 1);
        CRange last = new CRange(lastShard.getStart(), totalRange.getEnd());
        newRanges.add(new IndexRangeConstraint(indexToUse, last));
        brokenRanges.put(hiveColumn, newRanges);
    }
    // <-- finished creating fragment map

    /*
     * Finally, take the cross product of all the fragments in order to find the
     * splits.
     */
    List<List<Constraint>> andRangeList = recursiveBuildRangeTuples(brokenRanges);
    InputSplit[] splits = new InputSplit[andRangeList.size()];
    for (int i = 0; i < splits.length; i++) {
        List<Constraint> constraints = andRangeList.get(i);
        Set<String> locations = new HashSet<String>();
        for (Constraint c : constraints) {
            locations.addAll(((IndexRangeConstraint) c).getIndex().getPreferredHosts());
        }
        // add field selection too...
        List<Constraint> fieldSelectingConstraints = new ArrayList<Constraint>(constraints.size());
        // TODO - wrap these constraints in field selecting constraints now that
        // we've pulled the preferred hosts out of them
        splits[i] = new CulvertInputSplit(new And(constraints), locations);
    }
    return splits;
}

From source file:com.benchmark.mapred.Join.java

License:Apache License

/**
 * The main driver for the join program.
 * Invoke this method to submit the map/reduce job.
 * @throws IOException When there are communication problems with the
 *                     job tracker.
 */
public int run(String[] args) throws Exception {
    JobConf jobConf = new JobConf(getConf(), Sort.class);
    jobConf.setJobName("join");

    jobConf.setMapperClass(IdentityMapper.class);
    jobConf.setReducerClass(IdentityReducer.class);

    JobClient client = new JobClient(jobConf);
    ClusterStatus cluster = client.getClusterStatus();
    int num_maps = cluster.getTaskTrackers() * jobConf.getInt("test.sort.maps_per_host", 10);
    int num_reduces = (int) (cluster.getMaxReduceTasks() * 0.9);
    String sort_reduces = jobConf.get("test.sort.reduces_per_host");
    if (sort_reduces != null) {
        num_reduces = cluster.getTaskTrackers() * Integer.parseInt(sort_reduces);
    }
    Class<? extends InputFormat> inputFormatClass = SequenceFileInputFormat.class;
    Class<? extends OutputFormat> outputFormatClass = SequenceFileOutputFormat.class;
    Class<? extends WritableComparable> outputKeyClass = BytesWritable.class;
    Class<? extends Writable> outputValueClass = TupleWritable.class;
    String op = "inner";
    List<String> otherArgs = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-m".equals(args[i])) {
                num_maps = Integer.parseInt(args[++i]);
            } else if ("-r".equals(args[i])) {
                num_reduces = Integer.parseInt(args[++i]);
            } else if ("-inFormat".equals(args[i])) {
                inputFormatClass = Class.forName(args[++i]).asSubclass(InputFormat.class);
            } else if ("-outFormat".equals(args[i])) {
                outputFormatClass = Class.forName(args[++i]).asSubclass(OutputFormat.class);
            } else if ("-outKey".equals(args[i])) {
                outputKeyClass = Class.forName(args[++i]).asSubclass(WritableComparable.class);
            } else if ("-outValue".equals(args[i])) {
                outputValueClass = Class.forName(args[++i]).asSubclass(Writable.class);
            } else if ("-joinOp".equals(args[i])) {
                op = args[++i];
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage(); // exits
        }
    }

    // Set user-supplied (possibly default) job configs
    jobConf.setNumMapTasks(num_maps);
    jobConf.setNumReduceTasks(num_reduces);

    if (otherArgs.size() < 2) {
        System.out.println("ERROR: Wrong number of parameters: ");
        return printUsage();
    }

    FileOutputFormat.setOutputPath(jobConf, new Path(otherArgs.remove(otherArgs.size() - 1)));
    List<Path> plist = new ArrayList<Path>(otherArgs.size());
    for (String s : otherArgs) {
        plist.add(new Path(s));
    }

    jobConf.setInputFormat(CompositeInputFormat.class);
    jobConf.set("mapred.join.expr",
            CompositeInputFormat.compose(op, inputFormatClass, plist.toArray(new Path[0])));
    jobConf.setOutputFormat(outputFormatClass);

    jobConf.setOutputKeyClass(outputKeyClass);
    jobConf.setOutputValueClass(outputValueClass);

    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    JobClient.runJob(jobConf);
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
    return 0;
}
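
A worked sketch of the reducer-count logic above, under assumed cluster numbers (10 task trackers, 40 reduce slots) and with test.sort.reduces_per_host set to 2.

int taskTrackers = 10;                              // cluster.getTaskTrackers()
int maxReduceSlots = 40;                            // cluster.getMaxReduceTasks()
int numReduces = (int) (maxReduceSlots * 0.9);      // default: 36
String perHost = "2";                               // jobConf.get("test.sort.reduces_per_host")
if (perHost != null) {
    numReduces = taskTrackers * Integer.parseInt(perHost);   // override: 10 * 2 = 20
}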

From source file:com.benchmark.mapred.Sort.java

License:Apache License

/**
 * The main driver for the sort program.
 * Invoke this method to submit the map/reduce job.
 * @throws IOException When there are communication problems with the
 *                     job tracker.
 */
public int run(String[] args) throws Exception {

    JobConf jobConf = new JobConf(getConf(), Sort.class);
    jobConf.setJobName("sorter");

    jobConf.setMapperClass(IdentityMapper.class);
    jobConf.setReducerClass(IdentityReducer.class);

    JobClient client = new JobClient(jobConf);
    ClusterStatus cluster = client.getClusterStatus();
    int num_reduces = (int) (cluster.getMaxReduceTasks() * 0.9);
    String sort_reduces = jobConf.get("test.sort.reduces_per_host");
    if (sort_reduces != null) {
        num_reduces = cluster.getTaskTrackers() * Integer.parseInt(sort_reduces);
    }
    Class<? extends InputFormat> inputFormatClass = SequenceFileInputFormat.class;
    Class<? extends OutputFormat> outputFormatClass = SequenceFileOutputFormat.class;
    Class<? extends WritableComparable> outputKeyClass = BytesWritable.class;
    Class<? extends Writable> outputValueClass = BytesWritable.class;
    List<String> otherArgs = new ArrayList<String>();
    InputSampler.Sampler<K, V> sampler = null;
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-m".equals(args[i])) {
                jobConf.setNumMapTasks(Integer.parseInt(args[++i]));
            } else if ("-r".equals(args[i])) {
                num_reduces = Integer.parseInt(args[++i]);
            } else if ("-inFormat".equals(args[i])) {
                inputFormatClass = Class.forName(args[++i]).asSubclass(InputFormat.class);
            } else if ("-outFormat".equals(args[i])) {
                outputFormatClass = Class.forName(args[++i]).asSubclass(OutputFormat.class);
            } else if ("-outKey".equals(args[i])) {
                outputKeyClass = Class.forName(args[++i]).asSubclass(WritableComparable.class);
            } else if ("-outValue".equals(args[i])) {
                outputValueClass = Class.forName(args[++i]).asSubclass(Writable.class);
            } else if ("-totalOrder".equals(args[i])) {
                double pcnt = Double.parseDouble(args[++i]);
                int numSamples = Integer.parseInt(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits)
                    maxSplits = Integer.MAX_VALUE;
                sampler = new InputSampler.RandomSampler<K, V>(pcnt, numSamples, maxSplits);
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage(); // exits
        }
    }

    // Set user-supplied (possibly default) job configs
    jobConf.setNumReduceTasks(num_reduces);

    jobConf.setInputFormat(inputFormatClass);
    jobConf.setOutputFormat(outputFormatClass);

    jobConf.setOutputKeyClass(outputKeyClass);
    jobConf.setOutputValueClass(outputValueClass);

    // Make sure there are exactly 2 parameters left.
    if (otherArgs.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 2.");
        return printUsage();
    }
    FileInputFormat.setInputPaths(jobConf, otherArgs.get(0));
    FileOutputFormat.setOutputPath(jobConf, new Path(otherArgs.get(1)));

    if (sampler != null) {
        System.out.println("Sampling input to effect total-order sort...");
        jobConf.setPartitionerClass(TotalOrderPartitioner.class);
        Path inputDir = FileInputFormat.getInputPaths(jobConf)[0];
        inputDir = inputDir.makeQualified(inputDir.getFileSystem(jobConf));
        Path partitionFile = new Path(inputDir, "_sortPartitioning");
        TotalOrderPartitioner.setPartitionFile(jobConf, partitionFile);
        InputSampler.<K, V>writePartitionFile(jobConf, sampler);
        URI partitionUri = new URI(partitionFile.toString() + "#" + "_sortPartitioning");
        DistributedCache.addCacheFile(partitionUri, jobConf);
        DistributedCache.createSymlink(jobConf);
    }

    System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from "
            + FileInputFormat.getInputPaths(jobConf)[0] + " into " + FileOutputFormat.getOutputPath(jobConf)
            + " with " + num_reduces + " reduces.");
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    jobResult = JobClient.runJob(jobConf);
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
    return 0;
}

From source file:com.bianfeng.bfas.hive.io.RealtimeInputFormat2.java

License:Apache License

public RecordReader<LongWritable, Text> getRecordReader(InputSplit genericSplit, JobConf job, Reporter reporter)
        throws IOException {

    reporter.setStatus(genericSplit.toString());
    String delimiter = job.get("textinputformat.record.delimiter");
    byte[] recordDelimiterBytes = null;
    if (null != delimiter)
        recordDelimiterBytes = delimiter.getBytes();
    return new LineRecordReader(job, (FileSplit) genericSplit, recordDelimiterBytes);
}