Example usage for org.apache.hadoop.mapreduce Job getConfiguration

List of usage examples for org.apache.hadoop.mapreduce Job getConfiguration

Introduction

On this page you can find example usage for org.apache.hadoop.mapreduce Job getConfiguration.

Prototype

public Configuration getConfiguration() 

Document

Return the configuration for the job.
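
For orientation, here is a minimal sketch of how the returned Configuration is typically used; the job name and property key below are hypothetical, not taken from the examples on this page.

Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "example-job"); // hypothetical job name
// getConfiguration() exposes the live Configuration backing the Job,
// so properties set through it are visible to the submitted job.
job.getConfiguration().set("example.custom.property", "some-value"); // hypothetical key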

Usage

From source file:com.ery.hadoop.mrddx.hbase.HbaseOutputFormat.java

License:Apache License

@Override
public void handle(Job conf) throws Exception {
    HbaseConfiguration hConf = new HbaseConfiguration(conf.getConfiguration(),
            HbaseConfiguration.FLAG_HBASE_OUTPUT);

    // Validate the output HBase table name
    String tableName = hConf.getOutputHBaseTableName();
    if (null == tableName || tableName.trim().length() <= 0) {
        String meg = "The HBase output table parameter <" + HbaseConfiguration.OUTPUT_TABLE + "> is not set.";
        LOG.error(meg);
        throw new Exception(meg);
    }

    // Validate the output HBase field and family names
    String hbaseFieldNames = hConf.getOutputHBaseFieldNames();
    this.vParamTargetFamilyNames(hbaseFieldNames, hConf);
    hConf.setOutputHBaseFamilyNames(this.getHBaseFamilyNames(hbaseFieldNames));

    // Validate the row key rule
    String rowKeyRule = hConf.getOutputHBaseRowKeyRule();
    if (null == rowKeyRule || rowKeyRule.trim().length() <= 0) {
        String meg = "The row key rule parameter <" + HbaseConfiguration.OUTPUT_ROWKEY_RULE + "> is not set.";
        LOG.error(meg);
        throw new Exception(meg);
    }

    // Validate the HFile maximum file size
    long hfileMaxfilesize = hConf.getOutputHBaseHFileMaxfilesize();
    if (hfileMaxfilesize <= 0) {
        String meg = "The HFile max file size parameter <" + HbaseConfiguration.OUTPUT_HFILE_MAXFILESIZE + "> must be greater than 0.";
        LOG.error(meg);
        throw new Exception(meg);
    }

    // Validate the memstore flush size
    long memstoreFlushSize = hConf.getOutputHBaseMemstoreFlushSize();
    if (memstoreFlushSize <= 0) {
        String meg = "The memstore flush size parameter <"
                + HbaseConfiguration.OUTPUT_MEMSTORE_FLUSHSIZE + "> must be greater than 0.";
        LOG.error(meg);
        throw new Exception(meg);
    }

    // Validate the column family block size
    int colmunBlocksize = hConf.getOutputHBaseColmunBlocksize();
    if (colmunBlocksize <= 0) {
        String meg = "The column block size parameter <" + HbaseConfiguration.OUTPUT_COLMUN_BLOCKSIZE + "> must be greater than 0.";
        LOG.error(meg);
        throw new Exception(meg);
    }

    // Validate the column maximum version count
    int colmunMaxversion = hConf.getOutputHBaseColmunMaxversion();
    if (colmunMaxversion <= 0) {
        String meg = "The column max version parameter <" + HbaseConfiguration.OUTPUT_COLMUN_MAXVERSION + "> must be greater than 0.";
        LOG.error(meg);
        throw new Exception(meg);
    }

    // Validate the column minimum version count
    int colmunMinversion = hConf.getOutputHBaseColmunMinversion();
    if (colmunMinversion <= 0) {
        String meg = "The column min version parameter <" + HbaseConfiguration.OUTPUT_COLMUN_MINVERSION + "> must be greater than 0.";
        LOG.error(meg);
        throw new Exception(meg);
    }

    // Validate the commit buffer length
    int commitBufferLength = hConf.getOutputHBaseBufferLength();
    if (commitBufferLength <= 0) {
        String meg = "The commit buffer length parameter <" + HbaseConfiguration.OUTPUT_SET_COMMIT_BUFFERLENGTH
                + "> must be greater than 0.";
        LOG.error(meg);
        throw new Exception(meg);
    }

    // Validate the HBase WAL flag
    int walFlag = hConf.getOutputHBaseSetWalFlags();
    if (!(walFlag == -1 || (walFlag >= 0 && walFlag <= 4))) {
        String meg = "The WAL flag parameter <" + HbaseConfiguration.OUTPUT_SET_WAL_FLAG
                + "> must be -1 or in the range [0-4].";
        LOG.error(meg);
        throw new Exception(meg);
    }

    // Validate the target HBase table
    if (!validateTable(hConf)) {
        String errorInfo = "HBase output table validation failed!";
        MRLog.error(LOG, errorInfo);
        throw new Exception(errorInfo);
    }

    conf.setOutputFormatClass(HbaseOutputFormat.class);
    conf.setReduceSpeculativeExecution(false);
    conf.setOutputKeyClass(DBRecord.class);
    conf.setOutputValueClass(NullWritable.class);
    conf.setReducerClass(DBReducer.class);

    // Print the table descriptor
    printTableDesc(tableName, hConf.getConf());
}

From source file:com.ery.hadoop.mrddx.hive.HiveOutputFormat.java

License:Apache License

/**
 * Configure the job for Hive output.
 *
 * @param job the map/reduce job
 * @param tableName the output Hive table name
 */
public static void setOutput(Job job, String tableName) {
    job.setOutputFormatClass(HiveOutputFormat.class);
    job.setReduceSpeculativeExecution(false);
    HiveConfiguration dbConf = new HiveConfiguration(job.getConfiguration());
    dbConf.setOutputHiveTableName(tableName);
}
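
A usage sketch for this helper; the job name and table name below are placeholders, not taken from the source:

Job job = Job.getInstance(new Configuration(), "hive-export"); // hypothetical job name
HiveOutputFormat.setOutput(job, "example_table"); // hypothetical table name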

From source file:com.ery.hadoop.mrddx.hive.HiveOutputFormat.java

License:Apache License

@Override
public void handle(Job conf) throws Exception {
    /**
     * Validate the Hive output configuration.
     */
    HiveConfiguration hconf = new HiveConfiguration(conf.getConfiguration());
    // Validate the output row separator
    String outRowChars = hconf.getOutputHiveFileRowsSplitChars();
    if (null == outRowChars || outRowChars.length() <= 0) {
        String meg = "The row separator parameter <" + HiveConfiguration.OUTPUT_HIVE_ROWS_SPLITCHARS + "> is not set.";
        MRLog.error(LOG, meg);
        throw new Exception(meg);
    }

    // Validate the output field separator
    String outFileSplitChars = hconf.getOutputHiveFileFieldSplitChars();
    if (null == outFileSplitChars || outFileSplitChars.trim().length() <= 0) {
        String meg = "The field separator parameter <" + HiveConfiguration.OUTPUT_HIVE_FIELD_SPLITCHARS + "> is not set.";
        MRLog.error(LOG, meg);
        throw new Exception(meg);
    }

    boolean para = hconf.getOutputHiveCompress();
    // Validate the compression codec (see HDFSUtils.CompressCodec)
    String outCompressCodec = hconf.getOutputHiveCompressCodec();
    if (para && !HDFSUtils.isExistCompressCodec(outCompressCodec)) {
        String meg = "[MR ERROR]The compression codec parameter <" + HiveConfiguration.OUTPUT_HIVE_COMPRESS_CODEC
                + "> is not supported.";
        MRLog.error(LOG, meg);
        throw new Exception(meg);
    }

    // Validate the MR output target path
    String outTargetpath = hconf.getOutputTargetFilePath();
    hconf.setOutputTargetPath(outTargetpath);
    if (null == outTargetpath || outTargetpath.trim().length() <= 0) {
        MRLog.warn(LOG,
                "The MR target path parameter <" + HiveConfiguration.OUTPUT_HIVE_TARGET_PATH + "> is not set.");
    }

    // Validate the Hive connection URL
    String hiveUrl = hconf.getOutPutHiveConfigUrl();
    if (null == hiveUrl || hiveUrl.trim().length() <= 0) {
        String meg = "[MR ERROR]The Hive connection URL parameter <" + HiveConfiguration.OUTPUT_HIVE_CONFIG_URL
                + "> is not set.";
        LOG.error(meg);
        throw new Exception(meg);
    }

    // Validate the Hive user name
    String hiveUser = hconf.getOutPutHiveConfigUser();
    if (null == hiveUser || hiveUser.trim().length() <= 0) {
        LOG.warn("[MR WARN]The Hive user name parameter <" + HiveConfiguration.OUTPUT_HIVE_CONFIG_USER + "> is not set.");
    }

    // Validate the Hive password
    String hivePwd = hconf.getOutPutHiveConfigPassword();
    if (null == hivePwd || hivePwd.trim().length() <= 0) {
        LOG.warn("[MR WARN]The Hive password parameter <" + HiveConfiguration.OUTPUT_HIVE_CONFIG_PASSWORD + "> is not set.");
    }

    // Validate the output Hive table name
    String tableName = hconf.getOutputHiveTableName();
    if (null == tableName || tableName.trim().length() <= 0) {
        String meg = "[MR ERROR]The Hive output table parameter <" + HiveConfiguration.OUTPUT_TABLE + "> is not set.";
        LOG.error(meg);
        throw new Exception(meg);
    }

    // Validate the partition fields
    String partitionField[] = hconf.getOutputHivePartitionField();
    if (null != partitionField && partitionField.length > 0) {
        // Validate the output field names
        String[] outputFieldName = hconf.getOutputFieldNames();
        if (null == outputFieldName || outputFieldName.length <= 0) {
            String meg = "The output field names parameter <" + MRConfiguration.SYS_OUTPUT_FIELD_NAMES_PROPERTY + "> is not set.";
            MRLog.error(LOG, meg);
            throw new Exception(meg);
        }

        for (int i = 0; i < partitionField.length; i++) {
            boolean isExist = false;
            for (String s : outputFieldName) {
                if (s.equals(partitionField[i])) {
                    isExist = true;
                    break;
                }
            }

            if (!isExist) {
                String meg = "" + partitionField[i] + "<"
                        + HiveConfiguration.OUTPUT_HIVE_PARTITION_FIELD + ">?<"
                        + MRConfiguration.SYS_OUTPUT_FIELD_NAMES_PROPERTY + "";
                MRLog.error(LOG, meg);
                throw new Exception(meg);
            }
        }

        String orderOutputTempPath = hconf.getOutputHiveOrderTempPath();
        if (null == orderOutputTempPath || orderOutputTempPath.trim().length() <= 0) {
            String meg = "<" + HiveConfiguration.OUTPUT_HIVE_ORDER_TEMP_PATH + ">.";
            MRLog.error(LOG, meg);
            throw new Exception(meg);
        }

        String orderOutputFileNamePrefix = hconf.getOutputHiveOrderFileNamePrefix();
        if (null == orderOutputFileNamePrefix || orderOutputFileNamePrefix.trim().length() <= 0) {
            String meg = "???<" + HiveConfiguration.OUTPUT_HIVE_ORDER_TEMP_PATH + ">.";
            MRLog.warn(LOG, meg);
        }

        long orderOutputFileMaxCount = hconf.getOutputHiveOrderFileMaxCount();
        if (orderOutputFileMaxCount == 0) {
            String meg = "?<" + HiveConfiguration.OUTPUT_HIVE_ORDER_FILEMAXCOUNT
                    + ">0 -1(??).";
            MRLog.error(LOG, meg);
            throw new Exception(meg);
        }
    }

    // Validate the DDL statement to execute
    String ddlHQL = hconf.getOutputHiveExecuteDDLHQL();
    if (null == ddlHQL || ddlHQL.trim().length() <= 0) {
        LOG.warn("[MR WARN]The Hive DDL parameter <" + HiveConfiguration.OUTPUT_HIVE_DDL_HQL + "> is not set.");
    }

    try {
        executeDDLHQL(hconf);
        MRLog.info(LOG, "execute ddl hive sql success!");
    } catch (SQLException e) {
        MRLog.error(LOG, "execute ddl hive sql error!");
        e.printStackTrace();
    }

    conf.setReduceSpeculativeExecution(false);
    conf.setOutputFormatClass(HiveOutputFormat.class);
    conf.setOutputKeyClass(DBRecord.class);
    conf.setOutputValueClass(NullWritable.class);
    if (null != partitionField && partitionField.length > 0) {
        conf.setCombinerClass(DBGroupReducer.class);
        conf.setReducerClass(DBPartitionReducer.class);
    } else {
        conf.setCombinerClass(DBGroupReducer.class);
        conf.setReducerClass(DBReducer.class);
    }
}

From source file:com.ery.hadoop.mrddx.hive.HiveRCFileOutputFormat.java

License:Apache License

@Override
public void handle(Job conf) throws Exception {
    super.handle(conf);
    HiveConfiguration hconf = new HiveConfiguration(conf.getConfiguration());
    // Compression codec (see HDFSUtils.CompressCodec)
    String outCompressCodec = hconf.getOutputHiveCompressCodec();
    // BZip2Codec is not supported for RCFile output
    if (HDFSUtils.isBZip2CompressCodec(outCompressCodec)) {
        String meg = "[MR ERROR]The compression codec parameter <" + HiveConfiguration.OUTPUT_HIVE_COMPRESS_CODEC
                + "> must not be BZip2Codec.";
        MRLog.error(LOG, meg);
        throw new Exception(meg);
    }

    setColumnNumber(conf.getConfiguration(), hconf.getOutputFieldNames().length);
    conf.setOutputFormatClass(HiveRCFileOutputFormat.class);
}

From source file:com.fanlehai.hadoop.join.CompositeJoin.java

License:Apache License

/**
 * The main driver for the join program. Invoke this method to submit the
 * map/reduce job.
 *
 * @throws IOException
 *             When there are communication problems with the job tracker.
 */

@SuppressWarnings("rawtypes")
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    JobClient client = new JobClient(conf);
    ClusterStatus cluster = client.getClusterStatus();
    int num_reduces = (int) (cluster.getMaxReduceTasks() * 0.9);
    String join_reduces = conf.get(REDUCES_PER_HOST);
    if (join_reduces != null) {
        num_reduces = cluster.getTaskTrackers() * Integer.parseInt(join_reduces);
    }
    Job job = Job.getInstance(conf);
    job.setJobName("join");
    job.setJarByClass(CompositeJoin.class);

    job.setMapperClass(Mapper.class);
    job.setReducerClass(Reducer.class);

    Class<? extends InputFormat> inputFormatClass = KeyValueTextInputFormat.class;// SequenceFileInputFormat.class;
    Class<? extends OutputFormat> outputFormatClass = SequenceFileOutputFormat.class;
    Class<? extends WritableComparable> outputKeyClass = Text.class;// BytesWritable.class;
    Class<? extends Writable> outputValueClass = Text.class;//TupleWritable.class;
    String op = "inner";
    List<String> otherArgs = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-r".equals(args[i])) {
                num_reduces = Integer.parseInt(args[++i]);
            } else if ("-inFormat".equals(args[i])) {
                inputFormatClass = Class.forName(args[++i]).asSubclass(InputFormat.class);
            } else if ("-outFormat".equals(args[i])) {
                outputFormatClass = Class.forName(args[++i]).asSubclass(OutputFormat.class);
            } else if ("-outKey".equals(args[i])) {
                outputKeyClass = Class.forName(args[++i]).asSubclass(WritableComparable.class);
            } else if ("-outValue".equals(args[i])) {
                outputValueClass = Class.forName(args[++i]).asSubclass(Writable.class);
            } else if ("-joinOp".equals(args[i])) {
                op = args[++i];
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage(); // exits
        }
    }

    // Set user-supplied (possibly default) job configs
    job.setNumReduceTasks(num_reduces);

    if (otherArgs.size() < 2) {
        System.out.println("ERROR: Wrong number of parameters: ");
        return printUsage();
    }

    String strOut = otherArgs.remove(otherArgs.size() - 1);
    FileSystem.get(new Configuration()).delete(new Path(strOut), true);

    FileOutputFormat.setOutputPath(job, new Path(strOut));
    List<Path> plist = new ArrayList<Path>(otherArgs.size());
    for (String s : otherArgs) {
        plist.add(new Path(s));
    }

    job.setInputFormatClass(CompositeInputFormat.class);
    job.getConfiguration().set(CompositeInputFormat.JOIN_EXPR,
            CompositeInputFormat.compose(op, inputFormatClass, plist.toArray(new Path[0])));
    job.setOutputFormatClass(outputFormatClass);

    job.setMapperClass(MapComposite.class);

    job.setOutputKeyClass(outputKeyClass);
    job.setOutputValueClass(outputValueClass);

    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    int ret = job.waitForCompletion(true) ? 0 : 1;
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
    return ret;
}
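
A driver sketch for the method above, assuming CompositeJoin follows the usual Configured/Tool pattern; the argument values are placeholders:

public static void main(String[] args) throws Exception {
    // Illustrative arguments: join operation plus two input directories and one output directory.
    int exitCode = ToolRunner.run(new Configuration(), new CompositeJoin(),
            new String[] { "-joinOp", "outer", "/data/left", "/data/right", "/data/joined" });
    System.exit(exitCode);
}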

From source file:com.github.dryangkun.hbase.tidx.hive.HBaseStorageHandler.java

License:Apache License

@Override
public void configureJobConf(TableDesc tableDesc, JobConf jobConf) {
    try {
        HBaseSerDe.configureJobConf(tableDesc, jobConf);
        /*
         * HIVE-6356
         * The following code change is only needed for hbase-0.96.0 due to HBASE-9165, and
         * will not be required once Hive bumps up its hbase version). At that time , we will
         * only need TableMapReduceUtil.addDependencyJars(jobConf) here.
         */
        if (counterClass != null) {
            TableMapReduceUtil.addDependencyJars(jobConf, HBaseStorageHandler.class, TableInputFormatBase.class,
                    counterClass);
        } else {
            TableMapReduceUtil.addDependencyJars(jobConf, HBaseStorageHandler.class,
                    TableInputFormatBase.class);
        }
        if (HiveConf.getVar(jobConf, HiveConf.ConfVars.HIVE_HBASE_SNAPSHOT_NAME) != null) {
            // There is an extra dependency on MetricsRegistry for snapshot IF.
            TableMapReduceUtil.addDependencyJars(jobConf, MetricsRegistry.class);
        }
        Set<String> merged = new LinkedHashSet<String>(jobConf.getStringCollection("tmpjars"));

        Job copy = new Job(jobConf);
        TableMapReduceUtil.addDependencyJars(copy);
        merged.addAll(copy.getConfiguration().getStringCollection("tmpjars"));
        jobConf.set("tmpjars", StringUtils.arrayToString(merged.toArray(new String[0])));

        // Get credentials using the configuration instance which has HBase properties
        JobConf hbaseJobConf = new JobConf(getConf());
        org.apache.hadoop.hbase.mapred.TableMapReduceUtil.initCredentials(hbaseJobConf);
        ShimLoader.getHadoopShims().mergeCredentials(jobConf, hbaseJobConf);
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}

From source file:com.github.dryangkun.hbase.tidx.hive.HiveHBaseTableInputFormat.java

License:Apache License

@Override
public RecordReader<ImmutableBytesWritable, ResultWritable> getRecordReader(InputSplit split, JobConf jobConf,
        final Reporter reporter) throws IOException {

    HBaseSplit hbaseSplit = (HBaseSplit) split;
    TableSplit tableSplit = hbaseSplit.getTableSplit();

    Job job = new Job(jobConf);
    TaskAttemptContext tac = ShimLoader.getHadoopShims().newTaskAttemptContext(job.getConfiguration(),
            reporter);

    final org.apache.hadoop.mapreduce.RecordReader<ImmutableBytesWritable, Result> recordReader;
    if (hbaseSplit.isTxIndexScan()) {
        LOG.info("getRecordReader: TxHiveIndexScan -> " + tableSplit);
        recordReader = TxHiveTableInputFormatUtil.createRecordReader(tableSplit, tac, jobConf);
    } else {
        LOG.info("getRecordReader: no TxHiveIndexScan -> " + tableSplit);
        setHTable(HiveHBaseInputFormatUtil.getTable(jobConf));
        setScan(HiveHBaseInputFormatUtil.getScan(jobConf));
        recordReader = createRecordReader(tableSplit, tac);
    }
    try {
        recordReader.initialize(tableSplit, tac);
    } catch (InterruptedException e) {
        throw new IOException("Failed to initialize RecordReader", e);
    }

    return new RecordReader<ImmutableBytesWritable, ResultWritable>() {

        @Override
        public void close() throws IOException {
            recordReader.close();
            closeTable();
        }

        @Override
        public ImmutableBytesWritable createKey() {
            return new ImmutableBytesWritable();
        }

        @Override
        public ResultWritable createValue() {
            return new ResultWritable(new Result());
        }

        @Override
        public long getPos() throws IOException {
            return 0;
        }

        @Override
        public float getProgress() throws IOException {
            float progress = 0.0F;
            try {
                progress = recordReader.getProgress();
            } catch (InterruptedException e) {
                throw new IOException(e);
            }
            return progress;
        }

        @Override
        public boolean next(ImmutableBytesWritable rowKey, ResultWritable value) throws IOException {
            boolean next = false;
            try {
                next = recordReader.nextKeyValue();
                if (next) {
                    rowKey.set(recordReader.getCurrentValue().getRow());
                    value.setResult(recordReader.getCurrentValue());
                }
            } catch (InterruptedException e) {
                throw new IOException(e);
            }
            return next;
        }
    };
}

From source file:com.github.dryangkun.hbase.tidx.hive.HiveHFileOutputFormat.java

License:Apache License

@Override
public RecordWriter getHiveRecordWriter(final JobConf jc, final Path finalOutPath,
        Class<? extends Writable> valueClass, boolean isCompressed, Properties tableProperties,
        final Progressable progressable) throws IOException {

    // Read configuration for the target path, first from jobconf, then from table properties
    String hfilePath = getFamilyPath(jc, tableProperties);
    if (hfilePath == null) {
        throw new RuntimeException("Please set " + HFILE_FAMILY_PATH + " to target location for HFiles");
    }

    // Target path's last component is also the column family name.
    final Path columnFamilyPath = new Path(hfilePath);
    final String columnFamilyName = columnFamilyPath.getName();
    final byte[] columnFamilyNameBytes = Bytes.toBytes(columnFamilyName);
    final Job job = new Job(jc);
    setCompressOutput(job, isCompressed);
    setOutputPath(job, finalOutPath);

    // Create the HFile writer
    final org.apache.hadoop.mapreduce.TaskAttemptContext tac = ShimLoader.getHadoopShims()
            .newTaskAttemptContext(job.getConfiguration(), progressable);

    final Path outputdir = FileOutputFormat.getOutputPath(tac);
    final org.apache.hadoop.mapreduce.RecordWriter<ImmutableBytesWritable, KeyValue> fileWriter = getFileWriter(
            tac);

    // Individual columns are going to be pivoted to HBase cells,
    // and for each row, they need to be written out in order
    // of column name, so sort the column names now, creating a
    // mapping to their column position.  However, the first
    // column is interpreted as the row key.
    String columnList = tableProperties.getProperty("columns");
    String[] columnArray = columnList.split(",");
    final SortedMap<byte[], Integer> columnMap = new TreeMap<byte[], Integer>(Bytes.BYTES_COMPARATOR);
    int i = 0;
    for (String columnName : columnArray) {
        if (i != 0) {
            columnMap.put(Bytes.toBytes(columnName), i);
        }
        ++i;
    }

    return new RecordWriter() {

        @Override
        public void close(boolean abort) throws IOException {
            try {
                fileWriter.close(null);
                if (abort) {
                    return;
                }
                // Move the hfiles file(s) from the task output directory to the
                // location specified by the user.
                FileSystem fs = outputdir.getFileSystem(jc);
                fs.mkdirs(columnFamilyPath);
                Path srcDir = outputdir;
                for (;;) {
                    FileStatus[] files = fs.listStatus(srcDir, FileUtils.STAGING_DIR_PATH_FILTER);
                    if ((files == null) || (files.length == 0)) {
                        throw new IOException("No family directories found in " + srcDir);
                    }
                    if (files.length != 1) {
                        throw new IOException("Multiple family directories found in " + srcDir);
                    }
                    srcDir = files[0].getPath();
                    if (srcDir.getName().equals(columnFamilyName)) {
                        break;
                    }
                }
                for (FileStatus regionFile : fs.listStatus(srcDir, FileUtils.STAGING_DIR_PATH_FILTER)) {
                    fs.rename(regionFile.getPath(), new Path(columnFamilyPath, regionFile.getPath().getName()));
                }
                // Hive actually wants a file as task output (not a directory), so
                // replace the empty directory with an empty file to keep it happy.
                fs.delete(outputdir, true);
                fs.createNewFile(outputdir);
            } catch (InterruptedException ex) {
                throw new IOException(ex);
            }
        }

        private void writeText(Text text) throws IOException {
            // Decompose the incoming text row into fields.
            String s = text.toString();
            String[] fields = s.split("\u0001");
            assert (fields.length <= (columnMap.size() + 1));
            // First field is the row key.
            byte[] rowKeyBytes = Bytes.toBytes(fields[0]);
            // Remaining fields are cells addressed by column name within row.
            for (Map.Entry<byte[], Integer> entry : columnMap.entrySet()) {
                byte[] columnNameBytes = entry.getKey();
                int iColumn = entry.getValue();
                String val;
                if (iColumn >= fields.length) {
                    // trailing blank field
                    val = "";
                } else {
                    val = fields[iColumn];
                    if ("\\N".equals(val)) {
                        // omit nulls
                        continue;
                    }
                }
                byte[] valBytes = Bytes.toBytes(val);
                KeyValue kv = new KeyValue(rowKeyBytes, columnFamilyNameBytes, columnNameBytes, valBytes);
                try {
                    fileWriter.write(null, kv);
                } catch (IOException e) {
                    LOG.error("Failed while writing row: " + s);
                    throw e;
                } catch (InterruptedException ex) {
                    throw new IOException(ex);
                }
            }
        }

        private void writePut(PutWritable put) throws IOException {
            ImmutableBytesWritable row = new ImmutableBytesWritable(put.getPut().getRow());
            SortedMap<byte[], List<Cell>> cells = put.getPut().getFamilyCellMap();
            for (Map.Entry<byte[], List<Cell>> entry : cells.entrySet()) {
                Collections.sort(entry.getValue(), new CellComparator());
                for (Cell c : entry.getValue()) {
                    try {
                        fileWriter.write(row, KeyValueUtil.copyToNewKeyValue(c));
                    } catch (InterruptedException e) {
                        throw (InterruptedIOException) new InterruptedIOException().initCause(e);
                    }
                }
            }
        }

        @Override
        public void write(Writable w) throws IOException {
            if (w instanceof Text) {
                writeText((Text) w);
            } else if (w instanceof PutWritable) {
                writePut((PutWritable) w);
            } else {
                throw new IOException("Unexpected writable " + w);
            }
        }
    };
}

From source file:com.github.libsml.commons.util.HadoopUtils.java

License:Apache License

/**
 * Create a map-only Hadoop Job out of the passed in parameters.  Does not set the
 * Job name.
 *
 * @see #getCustomJobName(String, JobContext, Class, Class)
 */
public static Job prepareJob(Path inputPath, Path outputPath, Class<? extends InputFormat> inputFormat,
        Class<? extends Mapper> mapper, Class<? extends Writable> mapperKey,
        Class<? extends Writable> mapperValue, Class<? extends OutputFormat> outputFormat, Configuration conf)
        throws IOException {

    //    Job job = new Job(new Configuration(conf));
    Job job = Job.getInstance(conf);
    Configuration jobConf = job.getConfiguration();

    if (mapper.equals(Mapper.class)) {
        throw new IllegalStateException("Can't figure out the user class jar file from mapper/reducer");
    }
    job.setJarByClass(mapper);

    job.setInputFormatClass(inputFormat);
    jobConf.set("mapred.input.dir", inputPath.toString());

    job.setMapperClass(mapper);
    job.setMapOutputKeyClass(mapperKey);
    job.setMapOutputValueClass(mapperValue);
    job.setOutputKeyClass(mapperKey);
    job.setOutputValueClass(mapperValue);
    jobConf.setBoolean("mapred.compress.map.output", true);
    job.setNumReduceTasks(0);

    job.setOutputFormatClass(outputFormat);
    jobConf.set("mapred.output.dir", outputPath.toString());

    return job;
}
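
A possible invocation of this helper, with illustrative paths and a hypothetical mapper class:

// TokenMapper and the paths below are placeholders for whatever the calling job actually uses.
Job countJob = HadoopUtils.prepareJob(
        new Path("/data/input"),   // input directory
        new Path("/data/output"),  // output directory
        TextInputFormat.class,     // read plain text lines
        TokenMapper.class,         // hypothetical Mapper<LongWritable, Text, Text, IntWritable>
        Text.class,                // mapper output key class
        IntWritable.class,         // mapper output value class
        TextOutputFormat.class,    // write plain text output
        new Configuration());
countJob.setJobName("token-count"); // prepareJob intentionally leaves the job name unset
countJob.waitForCompletion(true);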

From source file:com.github.libsml.commons.util.HadoopUtils.java

License:Apache License

/**
 *
 * @param inputPaths
 * @param outputPath
 * @param inputFormat
 * @param inputKey
 * @param inputValue
 * @param mapper
 * @param mapperKey
 * @param mapperValue
 * @param combiner
 * @param reducer
 * @param outputKey
 * @param outputValue
 * @param outputFormat
 * @param conf
 * @param overwrite
 * @param isCompress
 * @return
 * @throws IOException
 */
public static Job prepareAvroJob(String inputPaths, String outputPath, Class<? extends InputFormat> inputFormat,
        Object inputKey, Object inputValue, Class<? extends Mapper> mapper, Object mapperKey,
        Object mapperValue, Class<? extends Reducer> combiner, Class<? extends Reducer> reducer,
        Object outputKey, Object outputValue, Class<? extends OutputFormat> outputFormat, Configuration conf,
        boolean overwrite, boolean isCompress) throws IOException {

    Job job = Job.getInstance(conf);
    Configuration jobConf = job.getConfiguration();
    if (inputKey instanceof Schema) {

        if (inputValue instanceof Schema) {
            inputFormat = inputFormat == null ? AvroKeyValueInputFormat.class : inputFormat;
        }
        inputFormat = inputFormat == null ? AvroKeyInputFormat.class : inputFormat;

    }
    if (inputFormat != null) {
        job.setInputFormatClass(inputFormat);
    }

    if (inputKey instanceof Schema) {
        AvroJob.setInputKeySchema(job, (Schema) inputKey);
    }

    if (inputValue instanceof Schema) {
        AvroJob.setInputValueSchema(job, (Schema) inputValue);
    }

    if (outputKey instanceof Schema) {

        if (outputValue instanceof Schema) {
            outputFormat = outputFormat == null ? AvroKeyValueOutputFormat.class : outputFormat;
        }
        outputFormat = outputFormat == null ? AvroKeyOutputFormat.class : outputFormat;

    }
    if (outputFormat != null) {
        job.setOutputFormatClass(outputFormat);
    }

    if (outputKey instanceof Schema) {
        AvroJob.setOutputKeySchema(job, (Schema) outputKey);
    } else if (outputKey instanceof Class) {
        job.setOutputKeyClass((Class) outputKey);
    }

    if (outputValue instanceof Schema) {
        AvroJob.setOutputValueSchema(job, (Schema) outputValue);
    } else if (outputValue instanceof Class) {
        job.setOutputValueClass((Class) outputValue);
    }

    if (reducer == null) {
        job.setNumReduceTasks(0);

        if (mapperKey instanceof Schema) {
            AvroJob.setMapOutputKeySchema(job, (Schema) mapperKey);
        } else if (mapperKey instanceof Class) {
            job.setOutputKeyClass((Class) mapperKey);
        }

        if (mapperValue instanceof Schema) {
            AvroJob.setOutputValueSchema(job, (Schema) mapperValue);
        } else if (mapperValue instanceof Class) {
            job.setOutputValueClass((Class) mapperValue);
        }
        job.setJarByClass(mapper);

    } else if (reducer.equals(Reducer.class)) {
        if (mapper.equals(Mapper.class)) {
            throw new IllegalStateException("Can't figure out the user class jar file from mapper/reducer");
        }
        job.setJarByClass(mapper);

    } else {
        job.setJarByClass(reducer);

    }

    FileInputFormat.setInputPaths(job, inputPaths);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    if (isCompress) {
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, DeflateCodec.class);
    }

    job.setMapperClass(mapper);
    if (mapperKey instanceof Schema) {
        AvroJob.setMapOutputKeySchema(job, (Schema) mapperKey);
    } else if (mapperKey instanceof Class) {
        job.setMapOutputKeyClass((Class) mapperKey);
    }

    if (mapperValue instanceof Schema) {
        AvroJob.setMapOutputValueSchema(job, (Schema) mapperValue);
    } else if (mapperValue instanceof Class) {
        job.setMapOutputValueClass((Class) mapperValue);
    }

    if (reducer != null) {
        job.setReducerClass(reducer);
    }
    if (combiner != null) {
        job.setCombinerClass(combiner);
    }

    if (overwrite) {
        HadoopUtils.delete(jobConf, new Path(outputPath));
    }

    return job;

}
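
One way this helper might be invoked is sketched below; every schema, class, and path is a placeholder, and null is passed where the helper is left to pick its Avro defaults. The fragment assumes it lives in a driver method that declares the checked exceptions.

Schema recordSchema = new Schema.Parser().parse(new File("event.avsc")); // hypothetical schema file
Job avroJob = HadoopUtils.prepareAvroJob(
        "/data/events",           // inputPaths
        "/data/event-counts",     // outputPath
        null,                     // inputFormat: defaults to AvroKeyInputFormat because inputKey is a Schema
        recordSchema,             // inputKey schema
        null,                     // no inputValue schema
        EventMapper.class,        // hypothetical Mapper
        Text.class,               // mapper output key class
        LongWritable.class,       // mapper output value class
        null,                     // no combiner
        CountReducer.class,       // hypothetical Reducer
        Text.class,               // output key class
        LongWritable.class,       // output value class
        TextOutputFormat.class,   // outputFormat
        new Configuration(),      // conf
        true,                     // overwrite any existing output
        false);                   // no compression
avroJob.waitForCompletion(true);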