Example usage for org.apache.hadoop.mapred InputSplit getLength


Introduction

This page collects example usages of the org.apache.hadoop.mapred.InputSplit method getLength().

Prototype

long getLength() throws IOException;

Document

Get the total number of bytes in the data of the InputSplit.
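
Before the collected examples, here is a minimal sketch of the call, assuming a TextInputFormat and a hypothetical local input path (both are assumptions chosen only for illustration):

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;

public class SplitLengthSketch {
    public static void main(String[] args) throws IOException {
        JobConf job = new JobConf();
        job.setInputFormat(TextInputFormat.class);
        // Hypothetical input path, used only for this sketch.
        FileInputFormat.addInputPath(job, new Path("/tmp/input"));

        long totalBytes = 0;
        for (InputSplit split : job.getInputFormat().getSplits(job, 1)) {
            // getLength() reports the number of bytes covered by this split.
            totalBytes += split.getLength();
        }
        System.out.println("Total input size: " + totalBytes + " bytes");
    }
}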

Usage

From source file:DeprecatedBAMRecordReader.java

License:Open Source License

public DeprecatedBAMRecordReader(InputSplit split, final JobConf job, Reporter reporter) throws IOException {
    if (split instanceof DeprecatedFileVirtualSplit) {
        rr.initialize(((DeprecatedFileVirtualSplit) split).vs, new FakeTaskAttemptContext(job));

        splitLength = split.getLength();
        return;

    }
    if (split instanceof FileSplit) {
        // XXX             XXX
        //     XXX     XXX
        //         XXX
        //     XXX     XXX
        // XXX             XXX
        //
        // Hive gives us its own custom FileSplits for some reason, so we have
        // to do our own split alignment. (Sometimes, anyway; for "select
        // count(*) from table" we get FileSplits here, but for "select * from
        // table" our input format is used directly. Perhaps it's only because
        // the latter doesn't spawn a MapReduce job, so getting a FileSplit
        // here is the common case.)
        //
        // Since we get only one split at a time here, this is very poor: we
        // have to open the file for every split, even if it's the same file
        // every time.
        //
        // This should always work, but might be /very/ slow. I can't think of
        // a better way.

        final FileSplit fspl = (FileSplit) split;
        final Path path = fspl.getPath();

        final long beg = fspl.getStart();
        final long end = beg + fspl.getLength();

        final SeekableStream sin = WrapSeekable.openPath(path.getFileSystem(job), path);
        final BAMSplitGuesser guesser = new BAMSplitGuesser(sin);

        final long alignedBeg = guesser.guessNextBAMRecordStart(beg, end);
        sin.close();

        if (alignedBeg == end)
            throw new IOException("Guesser found nothing after pos " + beg);

        final long alignedEnd = end << 16 | 0xffff;
        splitLength = (alignedEnd - alignedBeg) >> 16;

        rr.initialize(new FileVirtualSplit(path, alignedBeg, alignedEnd, fspl.getLocations()),
                new FakeTaskAttemptContext(job));
        return;
    }

    throw new ClassCastException("Can only handle DeprecatedFileVirtualSplit and FileSplit");
}
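
The bit shifting near the end of this constructor relies on BGZF-style virtual file offsets, where the upper 48 bits of a 64-bit value hold the compressed (on-disk) offset of a block and the lower 16 bits hold the position inside the uncompressed block. A small sketch of that encoding, with made-up offsets (the variable names and numbers are illustrative only):

// Hypothetical offsets, chosen only to show the packing.
long compressedBlockStart = 1000000L; // byte offset of a BGZF block in the file
int withinBlock = 42;                 // position inside the uncompressed block

// Pack: upper 48 bits = compressed offset, lower 16 bits = in-block offset.
long virtualOffset = (compressedBlockStart << 16) | withinBlock;

// Unpack.
long fileOffset = virtualOffset >>> 16;           // 1000000
int blockOffset = (int) (virtualOffset & 0xffff); // 42

// Read this way, (alignedEnd - alignedBeg) >> 16 above measures the split
// roughly in compressed bytes, which is why it is used as splitLength.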

From source file:alluxio.hadoop.HadoopUtils.java

License:Apache License

/**
 * Returns a string representation of an {@link InputSplit}.
 *
 * @param is Hadoop {@link InputSplit}
 * @return its string representation
 */
public static String toStringHadoopInputSplit(InputSplit is) {
    StringBuilder sb = new StringBuilder("HadoopInputSplit: ");
    try {
        sb.append(" Length: ").append(is.getLength());
        sb.append(" , Locations: ");
        for (String loc : is.getLocations()) {
            sb.append(loc).append(" ; ");
        }
    } catch (IOException e) {
        LOG.error(e.getMessage());
    }
    return sb.toString();
}
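
A hedged usage sketch for the helper above, assuming a JobConf named job whose configured InputFormat yields at least one split:

InputSplit[] splits = job.getInputFormat().getSplits(job, 1);
System.out.println(HadoopUtils.toStringHadoopInputSplit(splits[0]));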

From source file:com.ostor.dedup.hadoop.DedupObjectInputFormat.java

License:Open Source License

public RecordReader<LongWritable, BytesWritable> getRecordReader(InputSplit genericSplit, JobConf job,
        Reporter reporter) throws IOException {

    logger.info("Dump record split - " + genericSplit);
    logger.info("Total length - " + genericSplit.getLength());

    reporter.setStatus(genericSplit.toString());
    return new BinaryRecordReader(job, (FileSplit) genericSplit);
}

From source file:edu.uci.ics.hyracks.hadoop.compat.util.HadoopAdapter.java

License:Apache License

private long getInputSize(Object[] splits, JobConf conf) throws IOException, InterruptedException {
    long totalInputSize = 0;
    if (conf.getUseNewMapper()) {
        for (org.apache.hadoop.mapreduce.InputSplit split : (org.apache.hadoop.mapreduce.InputSplit[]) splits) {
            totalInputSize += split.getLength();
        }
    } else {
        for (InputSplit split : (InputSplit[]) splits) {
            totalInputSize += split.getLength();
        }
    }
    return totalInputSize;
}

From source file:org.apache.drill.exec.store.hive.HiveInputReader.java

License:Apache License

public static void main(String args[]) throws Exception {
    /*
        String[] columnNames = {"n_nationkey", "n_name", "n_regionkey",   "n_comment"};
        String[] columnTypes = {"bigint", "string", "bigint", "string"};
            
        List<FieldSchema> cols = Lists.newArrayList();
            
        for (int i = 0; i < columnNames.length; i++) {
          cols.add(new FieldSchema(columnNames[i], columnTypes[i], null));
        }
        String location = "file:///tmp/nation_s";
        String inputFormat = TextInputFormat.class.getCanonicalName();
        String serdeLib = LazySimpleSerDe.class.getCanonicalName();
    //    String inputFormat = HiveHBaseTableInputFormat.class.getCanonicalName();
    //    String serdeLib = HBaseSerDe.class.getCanonicalName();
        Map<String, String> serdeParams = new HashMap();
    //    serdeParams.put("serialization.format", "1");
    //    serdeParams.put("hbase.columns.mapping", ":key,f:name,f:regionkey,f:comment");
        serdeParams.put("serialization.format", "|");
        serdeParams.put("field.delim", "|");
            
            
        Map<String, String> tableParams = new HashMap();
        tableParams.put("hbase.table.name", "nation");
        SerDeInfo serDeInfo = new SerDeInfo(null, serdeLib, serdeParams);
        StorageDescriptor storageDescriptor = new StorageDescriptor(cols, location, inputFormat, null, false, -1, serDeInfo, null, null, null);
        Table table = new Table("table", "default", "sphillips", 0, 0, 0, storageDescriptor, new ArrayList<FieldSchema>(), tableParams, null, null, "MANAGED_TABLE");
        Properties properties = MetaStoreUtils.getTableMetadata(table);
        */

    HiveConf conf = new HiveConf();
    conf.set("hive.metastore.uris", "thrift://10.10.31.51:9083");
    HiveMetaStoreClient client = new HiveMetaStoreClient(conf);
    Table table = client.getTable("default", "nation");
    Properties properties = MetaStoreUtils.getTableMetadata(table);

    Path path = new Path(table.getSd().getLocation());
    JobConf job = new JobConf();
    for (Object obj : properties.keySet()) {
        job.set((String) obj, (String) properties.get(obj));
    }
    //    job.set("hbase.zookeeper.quorum", "10.10.31.51");
    //    job.set("hbase.zookeeper.property.clientPort", "5181");
    InputFormat f = (InputFormat) Class.forName(table.getSd().getInputFormat()).getConstructor().newInstance();
    job.setInputFormat(f.getClass());
    FileInputFormat.addInputPath(job, path);
    InputFormat format = job.getInputFormat();
    SerDe serde = (SerDe) Class.forName(table.getSd().getSerdeInfo().getSerializationLib()).getConstructor()
            .newInstance();
    serde.initialize(job, properties);
    ObjectInspector inspector = serde.getObjectInspector();
    ObjectInspector.Category cat = inspector.getCategory();
    TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromObjectInspector(inspector);
    List<String> columns = null;
    List<TypeInfo> colTypes = null;
    List<ObjectInspector> fieldObjectInspectors = Lists.newArrayList();

    switch (typeInfo.getCategory()) {
    case STRUCT:
        columns = ((StructTypeInfo) typeInfo).getAllStructFieldNames();
        colTypes = ((StructTypeInfo) typeInfo).getAllStructFieldTypeInfos();
        for (int i = 0; i < columns.size(); i++) {
            System.out.print(columns.get(i));
            System.out.print(" ");
            System.out.print(colTypes.get(i));
        }
        System.out.println("");
        for (StructField field : ((StructObjectInspector) inspector).getAllStructFieldRefs()) {
            fieldObjectInspectors.add(field.getFieldObjectInspector());
        }
    }

    for (InputSplit split : format.getSplits(job, 1)) {
        String encoded = serializeInputSplit(split);
        System.out.println(encoded);
        InputSplit newSplit = deserializeInputSplit(encoded, split.getClass().getCanonicalName());
        System.out.print("Length: " + newSplit.getLength() + " ");
        System.out.print("Locations: ");
        for (String loc : newSplit.getLocations())
            System.out.print(loc + " ");
        System.out.println();
    }

    for (InputSplit split : format.getSplits(job, 1)) {
        RecordReader reader = format.getRecordReader(split, job, Reporter.NULL);
        Object key = reader.createKey();
        Object value = reader.createValue();
        int count = 0;
        while (reader.next(key, value)) {
            List<Object> values = ((StructObjectInspector) inspector)
                    .getStructFieldsDataAsList(serde.deserialize((Writable) value));
            StructObjectInspector sInsp = (StructObjectInspector) inspector;
            Object obj = sInsp.getStructFieldData(serde.deserialize((Writable) value),
                    sInsp.getStructFieldRef("n_name"));
            System.out.println(obj);
            /*
            for (Object obj : values) {
              PrimitiveObjectInspector.PrimitiveCategory pCat = ((PrimitiveObjectInspector)fieldObjectInspectors.get(count)).getPrimitiveCategory();
              Object pObj = ((PrimitiveObjectInspector)fieldObjectInspectors.get(count)).getPrimitiveJavaObject(obj);
              System.out.print(pObj + " ");
            }
            */
            System.out.println("");
        }
    }
}

From source file:org.apache.hawq.pxf.plugins.hdfs.HdfsAnalyzer.java

License:Apache License

private ArrayList<InputSplit> getSplits(Path path) throws IOException {
    PxfInputFormat fformat = new PxfInputFormat();
    PxfInputFormat.setInputPaths(jobConf, path);
    InputSplit[] splits = fformat.getSplits(jobConf, 1);
    ArrayList<InputSplit> result = new ArrayList<InputSplit>();

    // remove empty splits
    if (splits != null) {
        for (InputSplit split : splits) {
            if (split.getLength() > 0) {
                result.add(split);
            }
        }
    }

    return result;
}

From source file:org.apache.hawq.pxf.plugins.hdfs.HdfsDataFragmenter.java

License:Apache License

@Override
public FragmentsStats getFragmentsStats() throws Exception {
    String absoluteDataPath = HdfsUtilities.absoluteDataPath(inputData.getDataSource());
    ArrayList<InputSplit> splits = getSplits(new Path(absoluteDataPath));

    if (splits.isEmpty()) {
        return new FragmentsStats(0, 0, 0);
    }
    long totalSize = 0;
    for (InputSplit split : splits) {
        totalSize += split.getLength();
    }
    InputSplit firstSplit = splits.get(0);
    return new FragmentsStats(splits.size(), firstSplit.getLength(), totalSize);
}

From source file:org.apache.hawq.pxf.plugins.hdfs.HdfsDataFragmenter.java

License:Apache License

private ArrayList<InputSplit> getSplits(Path path) throws IOException {
    PxfInputFormat fformat = new PxfInputFormat();
    PxfInputFormat.setInputPaths(jobConf, path);
    InputSplit[] splits = fformat.getSplits(jobConf, 1);
    ArrayList<InputSplit> result = new ArrayList<InputSplit>();

    /*
     * HD-2547: If the file is empty, an empty split is returned: no
     * locations and no length.
     */
    if (splits != null) {
        for (InputSplit split : splits) {
            if (split.getLength() > 0) {
                result.add(split);
            }
        }
    }

    return result;
}

From source file:org.apache.mahout.df.mapred.Builder.java

License:Apache License

/**
 * sort the splits into order based on size, so that the biggest go first.<br>
 * This is the same code used by Hadoop's JobClient.
 *
 * @param splits
 */
public static void sortSplits(InputSplit[] splits) {
    Arrays.sort(splits, new Comparator<InputSplit>() {
        @Override
        public int compare(InputSplit a, InputSplit b) {
            try {
                long left = a.getLength();
                long right = b.getLength();
                if (left == right) {
                    return 0;
                } else if (left < right) {
                    return 1;
                } else {
                    return -1;
                }
            } catch (IOException ie) {
                throw new IllegalStateException("Problem getting input split size", ie);
            }
        }
    });
}
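
A hedged usage note: assuming the splits come from a job's configured InputFormat (the numSplits hint of 4 is illustrative only), the comparator above would be applied like this, leaving the largest splits first:

InputSplit[] splits = job.getInputFormat().getSplits(job, 4);
Builder.sortSplits(splits);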

From source file:org.apache.tez.mapreduce.processor.MapUtils.java

License:Apache License

private static void writeSplitFiles(FileSystem fs, JobConf conf, InputSplit split) throws IOException {
    Path jobSplitFile = new Path(conf.get(MRFrameworkConfigs.TASK_LOCAL_RESOURCE_DIR,
            MRFrameworkConfigs.TASK_LOCAL_RESOURCE_DIR_DEFAULT), MRJobConfig.JOB_SPLIT);
    LOG.info("Writing split to: " + jobSplitFile);
    FSDataOutputStream out = FileSystem.create(fs, jobSplitFile, new FsPermission(JOB_FILE_PERMISSION));

    long offset = out.getPos();
    Text.writeString(out, split.getClass().getName());
    split.write(out);
    out.close();

    String[] locations = split.getLocations();

    SplitMetaInfo info = null;
    info = new JobSplit.SplitMetaInfo(locations, offset, split.getLength());

    Path jobSplitMetaInfoFile = new Path(conf.get(MRFrameworkConfigs.TASK_LOCAL_RESOURCE_DIR),
            MRJobConfig.JOB_SPLIT_METAINFO);

    FSDataOutputStream outMeta = FileSystem.create(fs, jobSplitMetaInfoFile,
            new FsPermission(JOB_FILE_PERMISSION));
    outMeta.write(SplitMetaInfoReaderTez.META_SPLIT_FILE_HEADER);
    WritableUtils.writeVInt(outMeta, SplitMetaInfoReaderTez.META_SPLIT_VERSION);
    WritableUtils.writeVInt(outMeta, 1); // Only 1 split meta info being written
    info.write(outMeta);
    outMeta.close();
}