List of usage examples for org.apache.hadoop.fs Path getFileSystem
public FileSystem getFileSystem(Configuration conf) throws IOException
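Before the collected examples, a minimal standalone sketch of the call (the path and class name here are illustrative placeholders, not from any of the source files below): Path.getFileSystem(Configuration) resolves the FileSystem implementation that owns the path, based on the path's scheme (hdfs://, file://, and so on) and the supplied configuration.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetFileSystemExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path("hdfs:///tmp/example.txt"); // hypothetical path
        FileSystem fs = path.getFileSystem(conf);        // FileSystem backing this path's scheme
        try (FSDataInputStream in = fs.open(path)) {
            System.out.println("First byte: " + in.read());
        }
    }
}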
From source file:com.fullcontact.sstable.hadoop.mapreduce.SSTableInputFormat.java
License:Apache License
/**
 * If we have a directory, recursively gather the files we care about for this job.
 *
 * @param file Root file/directory.
 * @param job Job context.
 * @return All files we care about.
 * @throws IOException
 */
private Collection<FileStatus> handleFile(final FileStatus file, final JobContext job) throws IOException {
    final List<FileStatus> results = Lists.newArrayList();

    if (file.isDir()) {
        final Path p = file.getPath();
        LOG.debug("Expanding {}", p);
        final FileSystem fs = p.getFileSystem(job.getConfiguration());
        final FileStatus[] children = fs.listStatus(p);
        for (FileStatus child : children) {
            results.addAll(handleFile(child, job));
        }
    } else {
        results.add(file);
    }
    return results;
}
From source file:com.fullcontact.sstable.hadoop.mapreduce.SSTableInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(final JobContext job) throws IOException {
    final Configuration configuration = job.getConfiguration();

    final List<InputSplit> result = Lists.newArrayList();

    final List<FileStatus> files = listStatus(job);

    LOG.debug("Initial file list: {} {}", files.size(), files);

    for (final FileStatus fileStatus : files) {
        final Path dataFile = fileStatus.getPath();
        final FileSystem fileSystem = dataFile.getFileSystem(configuration);
        final BlockLocation[] blockLocations = fileSystem.getFileBlockLocations(fileStatus, 0,
                fileStatus.getLen());

        // Data file, try to split if the .index file was found
        final SSTableIndexIndex index = indexes.get(dataFile);
        if (index == null) {
            throw new IOException("Index not found for " + dataFile);
        }

        for (final SSTableIndexIndex.Chunk chunk : index.getOffsets()) {
            // This isn't likely to work well because we are dealing with the index into uncompressed data...
            final int blockIndex = getBlockIndex(blockLocations,
                    chunk.getStart() / COMPRESSION_RATIO_ASSUMPTION);
            final SSTableSplit split = new SSTableSplit(dataFile, chunk.getStart(), chunk.getEnd(),
                    chunk.getEnd() - chunk.getStart(), blockLocations[blockIndex].getHosts());
            result.add(split);
        }
    }

    LOG.debug("Splits calculated: {} {}", result.size(), result);

    return result;
}
From source file:com.gemstone.gemfire.cache.hdfs.internal.hoplog.mapreduce.GFInputFormat.java
License:Apache License
/**
 * Identifies filters provided in the job configuration and creates a list of
 * sorted hoplogs. If there are no sorted hoplogs, checks if the region has
 * sequential hoplogs.
 *
 * @return list of hoplogs
 * @throws IOException
 */
protected Collection<FileStatus> getHoplogs() throws IOException {
    String regionName = conf.get(INPUT_REGION);
    System.out.println("GFInputFormat: Region Name is " + regionName);
    if (regionName == null || regionName.trim().isEmpty()) {
        // incomplete job configuration, region name must be provided
        return new ArrayList<FileStatus>();
    }

    String home = conf.get(HOME_DIR, HDFSStore.DEFAULT_HOME_DIR);
    regionName = HdfsRegionManager.getRegionFolder(regionName);
    Path regionPath = new Path(home + "/" + regionName);
    FileSystem fs = regionPath.getFileSystem(conf);

    long start = conf.getLong(START_TIME, 0l);
    long end = conf.getLong(END_TIME, 0l);
    boolean checkpoint = conf.getBoolean(CHECKPOINT, true);

    // if the region contains flush hoplogs then the region is of type RW.
    Collection<FileStatus> hoplogs;
    hoplogs = HoplogUtil.filterHoplogs(fs, regionPath, start, end, checkpoint);
    return hoplogs == null ? new ArrayList<FileStatus>() : hoplogs;
}
From source file:com.geneix.bottle.WordRecordReader.java
License:Apache License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    if (LOG.isInfoEnabled()) {
        LOG.info("Initializing WordRecordReader");
    }
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxWordLength = job.getInt(MAX_WORD_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);

    CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
    if (null != codec) {
        throw new IOException("Cannot handle compressed files right now");
    } else {
        fileIn.seek(start);
        in = new WordReader(fileIn, job);
        filePosition = fileIn;
    }

    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readWord(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}
From source file:com.github.bskaggs.avro_json_hadoop.AvroAsJsonRecordReader.java
License:Apache License
/** {@inheritDoc} */
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    if (!(inputSplit instanceof FileSplit)) {
        throw new IllegalArgumentException("Only compatible with FileSplits.");
    }
    FileSplit fileSplit = (FileSplit) inputSplit;

    // Open a seekable input stream to the Avro container file.
    SeekableInput seekableFileInput = createSeekableInput(context.getConfiguration(), fileSplit.getPath());

    // Wrap the seekable input stream in an Avro DataFileReader.
    Configuration conf = context.getConfiguration();
    GenericData dataModel = AvroSerialization.createDataModel(conf);
    GenericDatumReader<Object> reader = new GenericDatumReader<Object>();

    // Figure out the schema
    Path path = fileSplit.getPath();
    FSDataInputStream schemaStream = path.getFileSystem(conf).open(path);
    DataFileStream<Object> streamReader = new DataFileStream<Object>(schemaStream, reader);
    Schema mReaderSchema = streamReader.getSchema();
    streamReader.close();

    // Set up writer and encoder for json
    writer = new GenericDatumWriter<Object>(mReaderSchema);
    encoder = new TerseJsonEncoder(mReaderSchema, bout);

    @SuppressWarnings("unchecked")
    DatumReader<Object> datumReader = dataModel.createDatumReader(mReaderSchema);
    mAvroFileReader = createAvroFileReader(seekableFileInput, datumReader);

    // Initialize the start and end offsets into the file based on the boundaries of the
    // input split we're responsible for. We will read the first block that begins
    // after the input split start boundary. We will read up to but not including the
    // first block that starts after the input split end boundary.

    // Sync to the closest block/record boundary just after the beginning of our input split.
    mAvroFileReader.sync(fileSplit.getStart());

    // Initialize the start position to the beginning of the first block of the input split.
    mStartPosition = mAvroFileReader.previousSync();

    // Initialize the end position to the end of the input split (this isn't necessarily
    // on a block boundary, so using this for reporting progress will be approximate).
    mEndPosition = fileSplit.getStart() + fileSplit.getLength();
}
From source file:com.github.bskaggs.mapreduce.flowfile.AbstractFlowFileV3RecordReader.java
License:Apache License
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
    FileSplit fileSplit = (FileSplit) split;
    Path file = fileSplit.getPath();
    FileSystem fs = file.getFileSystem(context.getConfiguration());
    fileStream = fs.open(file);

    startPos = fileSplit.getStart();
    nextPos = startPos;
    length = fileSplit.getLength();
    lastPos = nextPos + length;
}
From source file:com.github.dryangkun.hbase.tidx.hive.HiveHFileOutputFormat.java
License:Apache License
@Override
public RecordWriter getHiveRecordWriter(final JobConf jc, final Path finalOutPath,
        Class<? extends Writable> valueClass, boolean isCompressed, Properties tableProperties,
        final Progressable progressable) throws IOException {

    // Read configuration for the target path, first from jobconf, then from table properties
    String hfilePath = getFamilyPath(jc, tableProperties);
    if (hfilePath == null) {
        throw new RuntimeException("Please set " + HFILE_FAMILY_PATH + " to target location for HFiles");
    }

    // Target path's last component is also the column family name.
    final Path columnFamilyPath = new Path(hfilePath);
    final String columnFamilyName = columnFamilyPath.getName();
    final byte[] columnFamilyNameBytes = Bytes.toBytes(columnFamilyName);
    final Job job = new Job(jc);
    setCompressOutput(job, isCompressed);
    setOutputPath(job, finalOutPath);

    // Create the HFile writer
    final org.apache.hadoop.mapreduce.TaskAttemptContext tac = ShimLoader.getHadoopShims()
            .newTaskAttemptContext(job.getConfiguration(), progressable);

    final Path outputdir = FileOutputFormat.getOutputPath(tac);
    final org.apache.hadoop.mapreduce.RecordWriter<ImmutableBytesWritable, KeyValue> fileWriter =
            getFileWriter(tac);

    // Individual columns are going to be pivoted to HBase cells,
    // and for each row, they need to be written out in order
    // of column name, so sort the column names now, creating a
    // mapping to their column position. However, the first
    // column is interpreted as the row key.
    String columnList = tableProperties.getProperty("columns");
    String[] columnArray = columnList.split(",");
    final SortedMap<byte[], Integer> columnMap = new TreeMap<byte[], Integer>(Bytes.BYTES_COMPARATOR);
    int i = 0;
    for (String columnName : columnArray) {
        if (i != 0) {
            columnMap.put(Bytes.toBytes(columnName), i);
        }
        ++i;
    }

    return new RecordWriter() {

        @Override
        public void close(boolean abort) throws IOException {
            try {
                fileWriter.close(null);
                if (abort) {
                    return;
                }
                // Move the hfiles file(s) from the task output directory to the
                // location specified by the user.
                FileSystem fs = outputdir.getFileSystem(jc);
                fs.mkdirs(columnFamilyPath);
                Path srcDir = outputdir;
                for (;;) {
                    FileStatus[] files = fs.listStatus(srcDir, FileUtils.STAGING_DIR_PATH_FILTER);
                    if ((files == null) || (files.length == 0)) {
                        throw new IOException("No family directories found in " + srcDir);
                    }
                    if (files.length != 1) {
                        throw new IOException("Multiple family directories found in " + srcDir);
                    }
                    srcDir = files[0].getPath();
                    if (srcDir.getName().equals(columnFamilyName)) {
                        break;
                    }
                }
                for (FileStatus regionFile : fs.listStatus(srcDir, FileUtils.STAGING_DIR_PATH_FILTER)) {
                    fs.rename(regionFile.getPath(),
                            new Path(columnFamilyPath, regionFile.getPath().getName()));
                }
                // Hive actually wants a file as task output (not a directory), so
                // replace the empty directory with an empty file to keep it happy.
                fs.delete(outputdir, true);
                fs.createNewFile(outputdir);
            } catch (InterruptedException ex) {
                throw new IOException(ex);
            }
        }

        private void writeText(Text text) throws IOException {
            // Decompose the incoming text row into fields.
            String s = text.toString();
            String[] fields = s.split("\u0001");
            assert (fields.length <= (columnMap.size() + 1));

            // First field is the row key.
            byte[] rowKeyBytes = Bytes.toBytes(fields[0]);

            // Remaining fields are cells addressed by column name within row.
            for (Map.Entry<byte[], Integer> entry : columnMap.entrySet()) {
                byte[] columnNameBytes = entry.getKey();
                int iColumn = entry.getValue();
                String val;
                if (iColumn >= fields.length) {
                    // trailing blank field
                    val = "";
                } else {
                    val = fields[iColumn];
                    if ("\\N".equals(val)) {
                        // omit nulls
                        continue;
                    }
                }
                byte[] valBytes = Bytes.toBytes(val);
                KeyValue kv = new KeyValue(rowKeyBytes, columnFamilyNameBytes, columnNameBytes, valBytes);
                try {
                    fileWriter.write(null, kv);
                } catch (IOException e) {
                    LOG.error("Failed while writing row: " + s);
                    throw e;
                } catch (InterruptedException ex) {
                    throw new IOException(ex);
                }
            }
        }

        private void writePut(PutWritable put) throws IOException {
            ImmutableBytesWritable row = new ImmutableBytesWritable(put.getPut().getRow());
            SortedMap<byte[], List<Cell>> cells = put.getPut().getFamilyCellMap();
            for (Map.Entry<byte[], List<Cell>> entry : cells.entrySet()) {
                Collections.sort(entry.getValue(), new CellComparator());
                for (Cell c : entry.getValue()) {
                    try {
                        fileWriter.write(row, KeyValueUtil.copyToNewKeyValue(c));
                    } catch (InterruptedException e) {
                        throw (InterruptedIOException) new InterruptedIOException().initCause(e);
                    }
                }
            }
        }

        @Override
        public void write(Writable w) throws IOException {
            if (w instanceof Text) {
                writeText((Text) w);
            } else if (w instanceof PutWritable) {
                writePut((PutWritable) w);
            } else {
                throw new IOException("Unexpected writable " + w);
            }
        }
    };
}
From source file:com.github.gaoyangthu.demo.mapred.terasort.TeraSort.java
License:Apache License
public int run(String[] args) throws Exception {
    LOG.info("starting");
    JobConf job = (JobConf) getConf();
    Path inputDir = new Path(args[0]);
    inputDir = inputDir.makeQualified(inputDir.getFileSystem(job));
    Path partitionFile = new Path(inputDir, TeraInputFormat.PARTITION_FILENAME);
    URI partitionUri = new URI(partitionFile.toString() + "#" + TeraInputFormat.PARTITION_FILENAME);
    TeraInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setJobName("TeraSort");
    job.setJarByClass(TeraSort.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormat(TeraInputFormat.class);
    job.setOutputFormat(TeraOutputFormat.class);
    job.setPartitionerClass(TotalOrderPartitioner.class);
    TeraInputFormat.writePartitionFile(job, partitionFile);
    DistributedCache.addCacheFile(partitionUri, job);
    DistributedCache.createSymlink(job);
    job.setInt("dfs.replication", 1);
    TeraOutputFormat.setFinalSync(job, true);
    JobClient.runJob(job);
    LOG.info("done");
    return 0;
}
From source file:com.github.joshelser.accumulo.DelimitedIngest.java
License:Apache License
private List<Path> convertInputToPaths() throws IOException {
    List<String> inputs = args.getInput();
    List<Path> paths = new ArrayList<>(inputs.size());
    for (String input : inputs) {
        Path p = new Path(input);
        FileSystem fs = p.getFileSystem(conf);
        FileStatus fstat = fs.getFileStatus(p);
        if (fstat.isFile()) {
            paths.add(p);
        } else if (fstat.isDirectory()) {
            for (FileStatus child : fs.listStatus(p)) {
                if (child.isFile()) {
                    paths.add(child.getPath());
                }
            }
        } else {
            throw new IllegalStateException("Unable to handle that which is not file nor directory: " + p);
        }
    }
    return paths;
}
From source file:com.github.joshelser.accumulo.DelimitedIngest.java
License:Apache License
private void processSinglePathWithByteBuffer(BatchWriter writer, FileMapping mapping, Path p, CsvParser parser)
        throws IOException, MutationsRejectedException {
    final FileSystem fs = p.getFileSystem(conf);
    FSDataInputStream dis = fs.open(p, INPUT_BUFFER_SIZE);
    InputStreamReader reader = new InputStreamReader(dis, UTF_8);
    try {
        parser.beginParsing(reader);
        String[] line = null;
        while ((line = parser.parseNext()) != null) {
            writer.addMutation(parseLine(mapping, line));
        }
    } finally {
        if (null != reader) {
            reader.close();
        }
    }
}