Example usage for org.apache.hadoop.mapred InputSplit getLength


Introduction

This page collects example usages of the org.apache.hadoop.mapred.InputSplit method getLength().

Prototype

long getLength() throws IOException;

Document

Get the total number of bytes in the data of the InputSplit.
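
Before the collected examples, here is a minimal sketch of the call, assuming a TextInputFormat and a hypothetical local input path (both are assumptions chosen only for illustration):

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;

public class SplitLengthSketch {
    public static void main(String[] args) throws IOException {
        JobConf job = new JobConf();
        job.setInputFormat(TextInputFormat.class);
        // Hypothetical input path, used only for this sketch.
        FileInputFormat.addInputPath(job, new Path("/tmp/input"));

        long totalBytes = 0;
        for (InputSplit split : job.getInputFormat().getSplits(job, 1)) {
            // getLength() reports the number of bytes covered by this split.
            totalBytes += split.getLength();
        }
        System.out.println("Total input size: " + totalBytes + " bytes");
    }
}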

Usage

From source file:DeprecatedBAMRecordReader.java

License:Open Source License

public DeprecatedBAMRecordReader(InputSplit split, final JobConf job, Reporter reporter) throws IOException {
    if (split instanceof DeprecatedFileVirtualSplit) {
        rr.initialize(((DeprecatedFileVirtualSplit) split).vs, new FakeTaskAttemptContext(job));

        splitLength = split.getLength();
        return;

    }
    if (split instanceof FileSplit) {
        // XXX             XXX
        //     XXX     XXX
        //         XXX
        //     XXX     XXX
        // XXX             XXX
        //
        // Hive gives us its own custom FileSplits for some reason, so we have
        // to do our own split alignment. (Sometimes, anyway; for "select
        // count(*) from table" we get FileSplits here, but for "select * from
        // table" our input format is used directly. Perhaps it's only because
        // the latter doesn't spawn a MapReduce job, so getting a FileSplit
        // here is the common case.)
        //
        // Since we get only one split at a time here, this is very poor: we
        // have to open the file for every split, even if it's the same file
        // every time.
        //
        // This should always work, but might be /very/ slow. I can't think of
        // a better way.

        final FileSplit fspl = (FileSplit) split;
        final Path path = fspl.getPath();

        final long beg = fspl.getStart();
        final long end = beg + fspl.getLength();

        final SeekableStream sin = WrapSeekable.openPath(path.getFileSystem(job), path);
        final BAMSplitGuesser guesser = new BAMSplitGuesser(sin);

        final long alignedBeg = guesser.guessNextBAMRecordStart(beg, end);
        sin.close();

        if (alignedBeg == end)
            throw new IOException("Guesser found nothing after pos " + beg);

        final long alignedEnd = end << 16 | 0xffff;
        splitLength = (alignedEnd - alignedBeg) >> 16;

        rr.initialize(new FileVirtualSplit(path, alignedBeg, alignedEnd, fspl.getLocations()),
                new FakeTaskAttemptContext(job));
        return;
    }

    throw new ClassCastException("Can only handle DeprecatedFileVirtualSplit and FileSplit");
}
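
The bit shifting near the end of this constructor relies on BGZF-style virtual file offsets, where the upper 48 bits of a 64-bit value hold the compressed (on-disk) offset of a block and the lower 16 bits hold the position inside the uncompressed block. A small sketch of that encoding, with made-up offsets (the variable names and numbers are illustrative only):

// Hypothetical offsets, chosen only to show the packing.
long compressedBlockStart = 1000000L; // byte offset of a BGZF block in the file
int withinBlock = 42;                 // position inside the uncompressed block

// Pack: upper 48 bits = compressed offset, lower 16 bits = in-block offset.
long virtualOffset = (compressedBlockStart << 16) | withinBlock;

// Unpack.
long fileOffset = virtualOffset >>> 16;           // 1000000
int blockOffset = (int) (virtualOffset & 0xffff); // 42

// Read this way, (alignedEnd - alignedBeg) >> 16 above measures the split
// roughly in compressed bytes, which is why it is used as splitLength.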

From source file:alluxio.hadoop.HadoopUtils.java

License:Apache License

/**
 * Returns a string representation of an {@link InputSplit}.
 *
 * @param is Hadoop {@link InputSplit}
 * @return its string representation
 */
public static String toStringHadoopInputSplit(InputSplit is) {
    StringBuilder sb = new StringBuilder("HadoopInputSplit: ");
    try {
        sb.append(" Length: ").append(is.getLength());
        sb.append(" , Locations: ");
        for (String loc : is.getLocations()) {
            sb.append(loc).append(" ; ");
        }
    } catch (IOException e) {
        LOG.error(e.getMessage());
    }
    return sb.toString();
}
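
A hedged usage sketch for the helper above, assuming a JobConf named job whose configured InputFormat yields at least one split:

InputSplit[] splits = job.getInputFormat().getSplits(job, 1);
System.out.println(HadoopUtils.toStringHadoopInputSplit(splits[0]));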

From source file:com.ostor.dedup.hadoop.DedupObjectInputFormat.java

License:Open Source License

public RecordReader<LongWritable, BytesWritable> getRecordReader(InputSplit genericSplit, JobConf job,
        Reporter reporter) throws IOException {

    logger.info("Dump record split - " + genericSplit);
    logger.info("Total length - " + genericSplit.getLength());

    reporter.setStatus(genericSplit.toString());
    return new BinaryRecordReader(job, (FileSplit) genericSplit);
}

From source file:edu.uci.ics.hyracks.hadoop.compat.util.HadoopAdapter.java

License:Apache License

private long getInputSize(Object[] splits, JobConf conf) throws IOException, InterruptedException {
    long totalInputSize = 0;
    if (conf.getUseNewMapper()) {
        for (org.apache.hadoop.mapreduce.InputSplit split : (org.apache.hadoop.mapreduce.InputSplit[]) splits) {
            totalInputSize += split.getLength();
        }
    } else {
        for (InputSplit split : (InputSplit[]) splits) {
            totalInputSize += split.getLength();
        }
    }
    return totalInputSize;
}

From source file:org.apache.drill.exec.store.hive.HiveInputReader.java

License:Apache License

public static void main(String args[]) throws Exception {
    /*
        String[] columnNames = {"n_nationkey", "n_name", "n_regionkey",   "n_comment"};
        String[] columnTypes = {"bigint", "string", "bigint", "string"};
            
        List<FieldSchema> cols = Lists.newArrayList();
            
        for (int i = 0; i < columnNames.length; i++) {
          cols.add(new FieldSchema(columnNames[i], columnTypes[i], null));
        }
        String location = "file:///tmp/nation_s";
        String inputFormat = TextInputFormat.class.getCanonicalName();
        String serdeLib = LazySimpleSerDe.class.getCanonicalName();
    //    String inputFormat = HiveHBaseTableInputFormat.class.getCanonicalName();
    //    String serdeLib = HBaseSerDe.class.getCanonicalName();
        Map<String, String> serdeParams = new HashMap();
    //    serdeParams.put("serialization.format", "1");
    //    serdeParams.put("hbase.columns.mapping", ":key,f:name,f:regionkey,f:comment");
        serdeParams.put("serialization.format", "|");
        serdeParams.put("field.delim", "|");
            
            
        Map<String, String> tableParams = new HashMap();
        tableParams.put("hbase.table.name", "nation");
        SerDeInfo serDeInfo = new SerDeInfo(null, serdeLib, serdeParams);
        StorageDescriptor storageDescriptor = new StorageDescriptor(cols, location, inputFormat, null, false, -1, serDeInfo, null, null, null);
        Table table = new Table("table", "default", "sphillips", 0, 0, 0, storageDescriptor, new ArrayList<FieldSchema>(), tableParams, null, null, "MANAGED_TABLE");
        Properties properties = MetaStoreUtils.getTableMetadata(table);
        */

    HiveConf conf = new HiveConf();
    conf.set("hive.metastore.uris", "thrift://10.10.31.51:9083");
    HiveMetaStoreClient client = new HiveMetaStoreClient(conf);
    Table table = client.getTable("default", "nation");
    Properties properties = MetaStoreUtils.getTableMetadata(table);

    Path path = new Path(table.getSd().getLocation());
    JobConf job = new JobConf();
    for (Object obj : properties.keySet()) {
        job.set((String) obj, (String) properties.get(obj));
    }
    //    job.set("hbase.zookeeper.quorum", "10.10.31.51");
    //    job.set("hbase.zookeeper.property.clientPort", "5181");
    InputFormat f = (InputFormat) Class.forName(table.getSd().getInputFormat()).getConstructor().newInstance();
    job.setInputFormat(f.getClass());
    FileInputFormat.addInputPath(job, path);
    InputFormat format = job.getInputFormat();
    SerDe serde = (SerDe) Class.forName(table.getSd().getSerdeInfo().getSerializationLib()).getConstructor()
            .newInstance();
    serde.initialize(job, properties);
    ObjectInspector inspector = serde.getObjectInspector();
    ObjectInspector.Category cat = inspector.getCategory();
    TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromObjectInspector(inspector);
    List<String> columns = null;
    List<TypeInfo> colTypes = null;
    List<ObjectInspector> fieldObjectInspectors = Lists.newArrayList();

    switch (typeInfo.getCategory()) {
    case STRUCT:
        columns = ((StructTypeInfo) typeInfo).getAllStructFieldNames();
        colTypes = ((StructTypeInfo) typeInfo).getAllStructFieldTypeInfos();
        for (int i = 0; i < columns.size(); i++) {
            System.out.print(columns.get(i));
            System.out.print(" ");
            System.out.print(colTypes.get(i));
        }
        System.out.println("");
        for (StructField field : ((StructObjectInspector) inspector).getAllStructFieldRefs()) {
            fieldObjectInspectors.add(field.getFieldObjectInspector());
        }
    }

    for (InputSplit split : format.getSplits(job, 1)) {
        String encoded = serializeInputSplit(split);
        System.out.println(encoded);
        InputSplit newSplit = deserializeInputSplit(encoded, split.getClass().getCanonicalName());
        System.out.print("Length: " + newSplit.getLength() + " ");
        System.out.print("Locations: ");
        for (String loc : newSplit.getLocations())
            System.out.print(loc + " ");
        System.out.println();
    }

    for (InputSplit split : format.getSplits(job, 1)) {
        RecordReader reader = format.getRecordReader(split, job, Reporter.NULL);
        Object key = reader.createKey();
        Object value = reader.createValue();
        int count = 0;
        while (reader.next(key, value)) {
            List<Object> values = ((StructObjectInspector) inspector)
                    .getStructFieldsDataAsList(serde.deserialize((Writable) value));
            StructObjectInspector sInsp = (StructObjectInspector) inspector;
            Object obj = sInsp.getStructFieldData(serde.deserialize((Writable) value),
                    sInsp.getStructFieldRef("n_name"));
            System.out.println(obj);
            /*
            for (Object obj : values) {
              PrimitiveObjectInspector.PrimitiveCategory pCat = ((PrimitiveObjectInspector)fieldObjectInspectors.get(count)).getPrimitiveCategory();
              Object pObj = ((PrimitiveObjectInspector)fieldObjectInspectors.get(count)).getPrimitiveJavaObject(obj);
              System.out.print(pObj + " ");
            }
            */
            System.out.println("");
        }
    }
}

From source file:org.apache.hawq.pxf.plugins.hdfs.HdfsAnalyzer.java

License:Apache License

private ArrayList<InputSplit> getSplits(Path path) throws IOException {
    PxfInputFormat fformat = new PxfInputFormat();
    PxfInputFormat.setInputPaths(jobConf, path);
    InputSplit[] splits = fformat.getSplits(jobConf, 1);
    ArrayList<InputSplit> result = new ArrayList<InputSplit>();

    // remove empty splits
    if (splits != null) {
        for (InputSplit split : splits) {
            if (split.getLength() > 0) {
                result.add(split);
            }
        }
    }

    return result;
}

From source file:org.apache.hawq.pxf.plugins.hdfs.HdfsDataFragmenter.java

License:Apache License

@Override
public FragmentsStats getFragmentsStats() throws Exception {
    String absoluteDataPath = HdfsUtilities.absoluteDataPath(inputData.getDataSource());
    ArrayList<InputSplit> splits = getSplits(new Path(absoluteDataPath));

    if (splits.isEmpty()) {
        return new FragmentsStats(0, 0, 0);
    }
    long totalSize = 0;
    for (InputSplit split : splits) {
        totalSize += split.getLength();
    }
    InputSplit firstSplit = splits.get(0);
    return new FragmentsStats(splits.size(), firstSplit.getLength(), totalSize);
}

From source file:org.apache.hawq.pxf.plugins.hdfs.HdfsDataFragmenter.java

License:Apache License

private ArrayList<InputSplit> getSplits(Path path) throws IOException {
    PxfInputFormat fformat = new PxfInputFormat();
    PxfInputFormat.setInputPaths(jobConf, path);
    InputSplit[] splits = fformat.getSplits(jobConf, 1);
    ArrayList<InputSplit> result = new ArrayList<InputSplit>();

    /*
     * HD-2547: If the file is empty, an empty split is returned: no
     * locations and no length.
     */
    if (splits != null) {
        for (InputSplit split : splits) {
            if (split.getLength() > 0) {
                result.add(split);
            }
        }
    }

    return result;
}

From source file:org.apache.mahout.df.mapred.Builder.java

License:Apache License

/**
 * sort the splits into order based on size, so that the biggest go first.<br>
 * This is the same code used by Hadoop's JobClient.
 *
 * @param splits
 */
public static void sortSplits(InputSplit[] splits) {
    Arrays.sort(splits, new Comparator<InputSplit>() {
        @Override
        public int compare(InputSplit a, InputSplit b) {
            try {
                long left = a.getLength();
                long right = b.getLength();
                if (left == right) {
                    return 0;
                } else if (left < right) {
                    return 1;
                } else {
                    return -1;
                }
            } catch (IOException ie) {
                throw new IllegalStateException("Problem getting input split size", ie);
            }
        }
    });
}
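
A hedged usage note: assuming the splits come from a job's configured InputFormat (the numSplits hint of 4 is illustrative only), the comparator above would be applied like this, leaving the largest splits first:

InputSplit[] splits = job.getInputFormat().getSplits(job, 4);
Builder.sortSplits(splits);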

From source file:org.apache.tez.mapreduce.processor.MapUtils.java

License:Apache License

private static void writeSplitFiles(FileSystem fs, JobConf conf, InputSplit split) throws IOException {
    Path jobSplitFile = new Path(conf.get(MRFrameworkConfigs.TASK_LOCAL_RESOURCE_DIR,
            MRFrameworkConfigs.TASK_LOCAL_RESOURCE_DIR_DEFAULT), MRJobConfig.JOB_SPLIT);
    LOG.info("Writing split to: " + jobSplitFile);
    FSDataOutputStream out = FileSystem.create(fs, jobSplitFile, new FsPermission(JOB_FILE_PERMISSION));

    long offset = out.getPos();
    Text.writeString(out, split.getClass().getName());
    split.write(out);
    out.close();

    String[] locations = split.getLocations();

    SplitMetaInfo info = null;
    info = new JobSplit.SplitMetaInfo(locations, offset, split.getLength());

    Path jobSplitMetaInfoFile = new Path(conf.get(MRFrameworkConfigs.TASK_LOCAL_RESOURCE_DIR),
            MRJobConfig.JOB_SPLIT_METAINFO);

    FSDataOutputStream outMeta = FileSystem.create(fs, jobSplitMetaInfoFile,
            new FsPermission(JOB_FILE_PERMISSION));
    outMeta.write(SplitMetaInfoReaderTez.META_SPLIT_FILE_HEADER);
    WritableUtils.writeVInt(outMeta, SplitMetaInfoReaderTez.META_SPLIT_VERSION);
    WritableUtils.writeVInt(outMeta, 1); // Only 1 split meta info being written
    info.write(outMeta);
    outMeta.close();
}