List of usage examples for org.apache.hadoop.mapred.InputSplit.getLength()
long getLength() throws IOException;
Declared in: org.apache.hadoop.mapred.InputSplit
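Before the project examples, here is a minimal sketch of the call itself, assuming a TextInputFormat over a placeholder input path (/tmp/input and the split hint of 1 are illustrative, not taken from any source below). getLength() reports the size of the split in bytes and may throw IOException, so callers must propagate or catch it.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;

public class GetLengthExample {
    public static void main(String[] args) throws IOException {
        JobConf job = new JobConf();
        job.setInputFormat(TextInputFormat.class);
        FileInputFormat.setInputPaths(job, new Path("/tmp/input")); // placeholder path

        InputFormat format = job.getInputFormat();
        // Ask for at least one split; the framework may return more.
        for (InputSplit split : format.getSplits(job, 1)) {
            // getLength() returns the number of bytes in the split.
            System.out.println(split + " -> " + split.getLength() + " bytes");
        }
    }
}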
From source file: DeprecatedBAMRecordReader.java
License: Open Source License
public DeprecatedBAMRecordReader(InputSplit split, final JobConf job, Reporter reporter) throws IOException {
    if (split instanceof DeprecatedFileVirtualSplit) {
        rr.initialize(((DeprecatedFileVirtualSplit) split).vs, new FakeTaskAttemptContext(job));
        splitLength = split.getLength();
        return;
    }
    if (split instanceof FileSplit) {
        // Hive gives us its own custom FileSplits for some reason, so we have
        // to do our own split alignment. (Sometimes, anyway; for "select
        // count(*) from table" we get FileSplits here, but for "select * from
        // table" our input format is used directly. Perhaps it's only because
        // the latter doesn't spawn a MapReduce job, so getting a FileSplit
        // here is the common case.)
        //
        // Since we get only one split at a time here, this is very poor: we
        // have to open the file for every split, even if it's the same file
        // every time.
        //
        // This should always work, but might be /very/ slow. I can't think of
        // a better way.
        final FileSplit fspl = (FileSplit) split;
        final Path path = fspl.getPath();
        final long beg = fspl.getStart();
        final long end = beg + fspl.getLength();

        final SeekableStream sin = WrapSeekable.openPath(path.getFileSystem(job), path);
        final BAMSplitGuesser guesser = new BAMSplitGuesser(sin);

        final long alignedBeg = guesser.guessNextBAMRecordStart(beg, end);
        sin.close();

        if (alignedBeg == end)
            throw new IOException("Guesser found nothing after pos " + beg);

        // BGZF virtual file offsets pack (compressed offset << 16 | intra-block
        // offset), so the split length is recovered in compressed bytes via >> 16.
        final long alignedEnd = end << 16 | 0xffff;
        splitLength = (alignedEnd - alignedBeg) >> 16;

        rr.initialize(new FileVirtualSplit(path, alignedBeg, alignedEnd, fspl.getLocations()),
                new FakeTaskAttemptContext(job));
        return;
    }
    throw new ClassCastException("Can only handle DeprecatedFileVirtualSplit and FileSplit");
}
From source file: alluxio.hadoop.HadoopUtils.java
License: Apache License
/**
 * Returns a string representation of a {@link InputSplit}.
 *
 * @param is Hadoop {@link InputSplit}
 * @return its string representation
 */
public static String toStringHadoopInputSplit(InputSplit is) {
    StringBuilder sb = new StringBuilder("HadoopInputSplit: ");
    try {
        sb.append(" Length: ").append(is.getLength());
        sb.append(" , Locations: ");
        for (String loc : is.getLocations()) {
            sb.append(loc).append(" ; ");
        }
    } catch (IOException e) {
        LOG.error(e.getMessage());
    }
    return sb.toString();
}
From source file: com.ostor.dedup.hadoop.DedupObjectInputFormat.java
License: Open Source License
public RecordReader<LongWritable, BytesWritable> getRecordReader(InputSplit genericSplit, JobConf job,
        Reporter reporter) throws IOException {
    logger.info("Dump record split - " + genericSplit);
    logger.info("Total length - " + genericSplit.getLength());

    reporter.setStatus(genericSplit.toString());
    return new BinaryRecordReader(job, (FileSplit) genericSplit);
}
From source file: edu.uci.ics.hyracks.hadoop.compat.util.HadoopAdapter.java
License: Apache License
private long getInputSize(Object[] splits, JobConf conf) throws IOException, InterruptedException {
    long totalInputSize = 0;
    if (conf.getUseNewMapper()) {
        for (org.apache.hadoop.mapreduce.InputSplit split : (org.apache.hadoop.mapreduce.InputSplit[]) splits) {
            totalInputSize += split.getLength();
        }
    } else {
        for (InputSplit split : (InputSplit[]) splits) {
            totalInputSize += split.getLength();
        }
    }
    return totalInputSize;
}
From source file: org.apache.drill.exec.store.hive.HiveInputReader.java
License: Apache License
public static void main(String args[]) throws Exception {
    /*
    String[] columnNames = {"n_nationkey", "n_name", "n_regionkey", "n_comment"};
    String[] columnTypes = {"bigint", "string", "bigint", "string"};

    List<FieldSchema> cols = Lists.newArrayList();
    for (int i = 0; i < columnNames.length; i++) {
        cols.add(new FieldSchema(columnNames[i], columnTypes[i], null));
    }
    String location = "file:///tmp/nation_s";
    String inputFormat = TextInputFormat.class.getCanonicalName();
    String serdeLib = LazySimpleSerDe.class.getCanonicalName();
    // String inputFormat = HiveHBaseTableInputFormat.class.getCanonicalName();
    // String serdeLib = HBaseSerDe.class.getCanonicalName();
    Map<String, String> serdeParams = new HashMap();
    // serdeParams.put("serialization.format", "1");
    // serdeParams.put("hbase.columns.mapping", ":key,f:name,f:regionkey,f:comment");
    serdeParams.put("serialization.format", "|");
    serdeParams.put("field.delim", "|");

    Map<String, String> tableParams = new HashMap();
    tableParams.put("hbase.table.name", "nation");

    SerDeInfo serDeInfo = new SerDeInfo(null, serdeLib, serdeParams);
    StorageDescriptor storageDescriptor = new StorageDescriptor(cols, location, inputFormat, null, false, -1,
            serDeInfo, null, null, null);
    Table table = new Table("table", "default", "sphillips", 0, 0, 0, storageDescriptor,
            new ArrayList<FieldSchema>(), tableParams, null, null, "MANAGED_TABLE");
    Properties properties = MetaStoreUtils.getTableMetadata(table);
    */

    HiveConf conf = new HiveConf();
    conf.set("hive.metastore.uris", "thrift://10.10.31.51:9083");
    HiveMetaStoreClient client = new HiveMetaStoreClient(conf);

    Table table = client.getTable("default", "nation");
    Properties properties = MetaStoreUtils.getTableMetadata(table);
    Path path = new Path(table.getSd().getLocation());

    JobConf job = new JobConf();
    for (Object obj : properties.keySet()) {
        job.set((String) obj, (String) properties.get(obj));
    }
    // job.set("hbase.zookeeper.quorum", "10.10.31.51");
    // job.set("hbase.zookeeper.property.clientPort", "5181");

    InputFormat f = (InputFormat) Class.forName(table.getSd().getInputFormat()).getConstructor().newInstance();
    job.setInputFormat(f.getClass());
    FileInputFormat.addInputPath(job, path);
    InputFormat format = job.getInputFormat();

    SerDe serde = (SerDe) Class.forName(table.getSd().getSerdeInfo().getSerializationLib()).getConstructor()
            .newInstance();
    serde.initialize(job, properties);

    ObjectInspector inspector = serde.getObjectInspector();
    ObjectInspector.Category cat = inspector.getCategory();
    TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromObjectInspector(inspector);

    List<String> columns = null;
    List<TypeInfo> colTypes = null;
    List<ObjectInspector> fieldObjectInspectors = Lists.newArrayList();

    switch (typeInfo.getCategory()) {
    case STRUCT:
        columns = ((StructTypeInfo) typeInfo).getAllStructFieldNames();
        colTypes = ((StructTypeInfo) typeInfo).getAllStructFieldTypeInfos();
        for (int i = 0; i < columns.size(); i++) {
            System.out.print(columns.get(i));
            System.out.print(" ");
            System.out.print(colTypes.get(i));
        }
        System.out.println("");
        for (StructField field : ((StructObjectInspector) inspector).getAllStructFieldRefs()) {
            fieldObjectInspectors.add(field.getFieldObjectInspector());
        }
    }

    for (InputSplit split : format.getSplits(job, 1)) {
        String encoded = serializeInputSplit(split);
        System.out.println(encoded);
        InputSplit newSplit = deserializeInputSplit(encoded, split.getClass().getCanonicalName());
        System.out.print("Length: " + newSplit.getLength() + " ");
        System.out.print("Locations: ");
        for (String loc : newSplit.getLocations())
            System.out.print(loc + " ");
        System.out.println();
    }

    for (InputSplit split : format.getSplits(job, 1)) {
        RecordReader reader = format.getRecordReader(split, job, Reporter.NULL);
        Object key = reader.createKey();
        Object value = reader.createValue();
        int count = 0;
        while (reader.next(key, value)) {
            List<Object> values = ((StructObjectInspector) inspector)
                    .getStructFieldsDataAsList(serde.deserialize((Writable) value));
            StructObjectInspector sInsp = (StructObjectInspector) inspector;
            Object obj = sInsp.getStructFieldData(serde.deserialize((Writable) value),
                    sInsp.getStructFieldRef("n_name"));
            System.out.println(obj);
            /*
            for (Object obj : values) {
                PrimitiveObjectInspector.PrimitiveCategory pCat =
                        ((PrimitiveObjectInspector) fieldObjectInspectors.get(count)).getPrimitiveCategory();
                Object pObj =
                        ((PrimitiveObjectInspector) fieldObjectInspectors.get(count)).getPrimitiveJavaObject(obj);
                System.out.print(pObj + " ");
            }
            */
            System.out.println("");
        }
    }
}
From source file: org.apache.hawq.pxf.plugins.hdfs.HdfsAnalyzer.java
License: Apache License
private ArrayList<InputSplit> getSplits(Path path) throws IOException {
    PxfInputFormat fformat = new PxfInputFormat();
    PxfInputFormat.setInputPaths(jobConf, path);
    InputSplit[] splits = fformat.getSplits(jobConf, 1);
    ArrayList<InputSplit> result = new ArrayList<InputSplit>();

    // remove empty splits
    if (splits != null) {
        for (InputSplit split : splits) {
            if (split.getLength() > 0) {
                result.add(split);
            }
        }
    }

    return result;
}
From source file: org.apache.hawq.pxf.plugins.hdfs.HdfsDataFragmenter.java
License: Apache License
@Override
public FragmentsStats getFragmentsStats() throws Exception {
    String absoluteDataPath = HdfsUtilities.absoluteDataPath(inputData.getDataSource());
    ArrayList<InputSplit> splits = getSplits(new Path(absoluteDataPath));

    if (splits.isEmpty()) {
        return new FragmentsStats(0, 0, 0);
    }
    long totalSize = 0;
    for (InputSplit split : splits) {
        totalSize += split.getLength();
    }
    InputSplit firstSplit = splits.get(0);
    return new FragmentsStats(splits.size(), firstSplit.getLength(), totalSize);
}
From source file: org.apache.hawq.pxf.plugins.hdfs.HdfsDataFragmenter.java
License: Apache License
private ArrayList<InputSplit> getSplits(Path path) throws IOException {
    PxfInputFormat fformat = new PxfInputFormat();
    PxfInputFormat.setInputPaths(jobConf, path);
    InputSplit[] splits = fformat.getSplits(jobConf, 1);
    ArrayList<InputSplit> result = new ArrayList<InputSplit>();

    /*
     * HD-2547: If the file is empty, an empty split is returned: no
     * locations and no length.
     */
    if (splits != null) {
        for (InputSplit split : splits) {
            if (split.getLength() > 0) {
                result.add(split);
            }
        }
    }

    return result;
}
From source file: org.apache.mahout.df.mapred.Builder.java
License: Apache License
/**
 * Sort the splits into order based on size, so that the biggest go first.<br>
 * This is the same code used by Hadoop's JobClient.
 *
 * @param splits
 */
public static void sortSplits(InputSplit[] splits) {
    Arrays.sort(splits, new Comparator<InputSplit>() {
        @Override
        public int compare(InputSplit a, InputSplit b) {
            try {
                long left = a.getLength();
                long right = b.getLength();
                // Reversed comparison on getLength(): larger splits sort first.
                if (left == right) {
                    return 0;
                } else if (left < right) {
                    return 1;
                } else {
                    return -1;
                }
            } catch (IOException ie) {
                throw new IllegalStateException("Problem getting input split size", ie);
            }
        }
    });
}
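A short sketch of how this helper might be driven; the input format, path, and split hint below are placeholder assumptions, not part of the Mahout source. Scheduling the largest splits first tends to improve cluster utilization at the tail of a job.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.mahout.df.mapred.Builder;

public class SortSplitsDemo {
    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf();
        job.setInputFormat(TextInputFormat.class);
        FileInputFormat.setInputPaths(job, new Path("/tmp/input")); // placeholder path

        InputSplit[] splits = job.getInputFormat().getSplits(job, 4);
        Builder.sortSplits(splits); // descending by getLength()
        // splits[0] now holds the largest split.
    }
}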
From source file: org.apache.tez.mapreduce.processor.MapUtils.java
License: Apache License
private static void writeSplitFiles(FileSystem fs, JobConf conf, InputSplit split) throws IOException {
    Path jobSplitFile = new Path(conf.get(MRFrameworkConfigs.TASK_LOCAL_RESOURCE_DIR,
            MRFrameworkConfigs.TASK_LOCAL_RESOURCE_DIR_DEFAULT), MRJobConfig.JOB_SPLIT);
    LOG.info("Writing split to: " + jobSplitFile);
    FSDataOutputStream out = FileSystem.create(fs, jobSplitFile, new FsPermission(JOB_FILE_PERMISSION));

    long offset = out.getPos();
    Text.writeString(out, split.getClass().getName());
    split.write(out);
    out.close();

    String[] locations = split.getLocations();

    SplitMetaInfo info = new JobSplit.SplitMetaInfo(locations, offset, split.getLength());

    Path jobSplitMetaInfoFile = new Path(conf.get(MRFrameworkConfigs.TASK_LOCAL_RESOURCE_DIR),
            MRJobConfig.JOB_SPLIT_METAINFO);

    FSDataOutputStream outMeta = FileSystem.create(fs, jobSplitMetaInfoFile,
            new FsPermission(JOB_FILE_PERMISSION));
    outMeta.write(SplitMetaInfoReaderTez.META_SPLIT_FILE_HEADER);
    WritableUtils.writeVInt(outMeta, SplitMetaInfoReaderTez.META_SPLIT_VERSION);
    WritableUtils.writeVInt(outMeta, 1); // Only 1 split meta info being written
    info.write(outMeta);
    outMeta.close();
}