List of usage examples for org.apache.hadoop.fs.Path.toUri()
public URI toUri()
From source file:com.marklogic.contentpump.FileAndDirectoryInputFormat.java
License:Apache License
@Override public List<InputSplit> getSplits(JobContext job) throws IOException { List<InputSplit> splits = new ArrayList<InputSplit>(); Configuration conf = job.getConfiguration(); try {//from w w w . ja va2s .com List<FileStatus> files = listStatus(job); long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job)); long maxSize = getMaxSplitSize(job); for (FileStatus child : files) { Path path = child.getPath(); FileSystem fs = path.getFileSystem(conf); // length is 0 for dir according to FSDirectory.java in 0.20 // however, w/ Hadoop2, dir in local fs has non-zero length long length = child.getLen(); BlockLocation[] blkLocations = null; if (!child.isDirectory() || fs instanceof DistributedFileSystem == false) { blkLocations = fs.getFileBlockLocations(child, 0, length); } else if (length != 0) { throw new IOException("non-zero length directory on HDFS:" + path.toUri().toString()); } if ((length != 0) && isSplitable(job, path)) { long blockSize = child.getBlockSize(); long splitSize = computeSplitSize(blockSize, minSize, maxSize); long bytesRemaining = length; while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) { int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining); splits.add(new FileSplit(path, length - bytesRemaining, splitSize, blkLocations[blkIndex].getHosts())); bytesRemaining -= splitSize; } if (bytesRemaining != 0) { splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining, blkLocations[blkLocations.length - 1].getHosts())); } } else if (length != 0) { splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts())); } else { // Create empty hosts array for zero length files splits.add(new FileSplit(path, 0, length, new String[0])); } } } catch (InvalidInputException ex) { String inPath = conf.get(ConfigConstants.CONF_INPUT_DIRECTORY); String pattern = conf.get(ConfigConstants.CONF_INPUT_FILE_PATTERN, ".*"); throw new IOException("No input files found with the specified input path " + inPath + " and input 
file pattern " + pattern, ex); } PathFilter jobFilter = getInputPathFilter(job); List<PathFilter> filters = new ArrayList<PathFilter>(); filters.add(hiddenFileFilter); if (jobFilter != null) { filters.add(jobFilter); } PathFilter inputFilter = new MultiPathFilter(filters); // take a second pass of the splits generated to extract files from // directories int count = 0; // flatten directories until reaching SPLIT_COUNT_LIMIT while (count < splits.size() && splits.size() < SPLIT_COUNT_LIMIT) { FileSplit split = (FileSplit) splits.get(count); Path file = split.getPath(); FileSystem fs = file.getFileSystem(conf); FileStatus status = fs.getFileStatus(file); if (status.isDirectory()) { FileStatus[] children = fs.listStatus(file, inputFilter); if (children.length + count < SPLIT_COUNT_LIMIT) { splits.remove(count); for (FileStatus stat : children) { FileSplit child = new FileSplit(stat.getPath(), 0, stat.getLen(), null); splits.add(child); } } else { count++; } } else { count++; } } return splits; }
From source file:com.marklogic.contentpump.ImportRecordReader.java
License:Apache License
protected String makeURIFromPath(Path file) { // get path portion of the file return file.toUri().getPath().toString(); }
From source file:com.marklogic.contentpump.ImportRecordReader.java
License:Apache License
/**
 * Derives a URI string for an entry inside a zip archive by resolving the
 * entry name against the archive's path and keeping only the path component.
 *
 * @param zipFile path of the zip archive, used as the parent
 * @param val     name of the entry, used as the child
 * @return the path component of the combined path's URI
 */
protected String makeURIForZipEntry(Path zipFile, String val) {
    return new Path(zipFile, val).toUri().getPath();
}
From source file:com.marklogic.contentpump.OutputArchive.java
License:Apache License
private void newOutputStream() throws IOException { // use the constructor filename for the first zip, // then add filecount to subsequent archives, if any. int count = fileCount.getAndIncrement(); currPath = newPackagePath(basePath, count, 6); if (outputStream != null) { if (LOG.isDebugEnabled()) { LOG.debug("closing output archive: " + currPath); }//ww w .j av a2s. c o m outputStream.flush(); outputStream.close(); } currentFileBytes = 0; currentEntries = 0; Path zpath = new Path(currPath); FileSystem fs = zpath.getFileSystem(conf); if (fs.exists(zpath)) { throw new IOException(zpath + " already exists."); } if (LOG.isDebugEnabled()) { LOG.debug("Creating output archive: " + zpath); LOG.debug("Default charset: " + Charset.defaultCharset()); } // if fs instanceof DistributedFileSystem, use hadoop api; otherwise, // use java api if (fs instanceof DistributedFileSystem) { FSDataOutputStream fsout = fs.create(zpath, false); outputStream = new ZipOutputStream(fsout); } else { File f = new File(zpath.toUri().getPath()); if (!f.exists()) { f.getParentFile().mkdirs(); f.createNewFile(); } FileOutputStream fos = new FileOutputStream(f, false); outputStream = new ZipOutputStream(fos); } }
From source file:com.marklogic.contentpump.SingleDocumentWriter.java
License:Apache License
@Override public void write(DocumentURI uri, MarkLogicDocument content) throws IOException, InterruptedException { OutputStream os = null;// w w w. j a v a2 s . c o m try { String childPath = URIUtil.getPathFromURI(uri); Path path; if (childPath.charAt(0) == '/') { // concatenate outputPath with path to form the path path = new Path(dir.toString() + childPath); } else { path = new Path(dir, childPath); } FileSystem fs = path.getFileSystem(conf); if (fs instanceof DistributedFileSystem) { os = fs.create(path, false); } else { File f = new File(path.toUri().getPath()); if (!f.exists()) { f.getParentFile().mkdirs(); f.createNewFile(); } os = new FileOutputStream(f, false); } ContentType type = content.getContentType(); if (ContentType.BINARY.equals(type)) { if (content.isStreamable()) { InputStream is = null; try { is = content.getContentAsByteStream(); long size = content.getContentSize(); long bufSize = Math.min(size, 512 << 10); byte[] buf = new byte[(int) bufSize]; for (long toRead = size, read = 0; toRead > 0; toRead -= read) { read = is.read(buf, 0, (int) bufSize); if (read > 0) { os.write(buf, 0, (int) read); } else { LOG.error("Premature EOF: uri=" + uri + ",toRead=" + toRead); break; } } } finally { if (is != null) { is.close(); } } } else { os.write(content.getContentAsByteArray()); } } else if (ContentType.TEXT.equals(type) || ContentType.XML.equals(type) || ContentType.JSON.equals(type)) { if (encoding.equals("UTF-8")) { Text t = content.getContentAsText(); os.write(t.getBytes(), 0, t.getLength()); } else { String t = content.getContentAsString(); os.write(t.getBytes(encoding)); } if (LOG.isTraceEnabled()) { Text t = content.getContentAsText(); LOG.trace(t); byte[] bytes = content.getContentAsByteArray(); StringBuilder sb = new StringBuilder(); for (int i = 0; i < bytes.length; i++) { sb.append(Byte.toString(bytes[i])); sb.append(" "); } LOG.trace(sb); } } else { LOG.error("Skipping " + uri + ". 
Unsupported content type: " + type.name()); } } catch (Exception e) { LOG.error("Error saving: " + uri, e); } finally { if (os != null) { os.close(); } } }
From source file:com.marklogic.mapreduce.ContentWriter.java
License:Apache License
protected Content createContent(DocumentURI key, VALUEOUT value) throws IOException { String uri = key.getUri();//ww w .ja v a 2 s. c o m Content content = null; if (value instanceof Text) { if (formatNeeded) { options.setFormat(DocumentFormat.TEXT); formatNeeded = false; } options.setEncoding(DEFAULT_OUTPUT_CONTENT_ENCODING); content = ContentFactory.newContent(uri, ((Text) value).getBytes(), 0, ((Text) value).getLength(), options); } else if (value instanceof MarkLogicNode) { if (formatNeeded) { options.setFormat(DocumentFormat.XML); formatNeeded = false; } content = ContentFactory.newContent(uri, ((MarkLogicNode) value).get(), options); } else if (value instanceof DOMDocument) { content = ContentFactory.newContent(uri, ((DOMDocument) value).getContentAsMarkLogicNode().get(), options); } else if (value instanceof JSONDocument) { JSONDocument doc = (JSONDocument) value; content = ContentFactory.newContent(uri, doc.getContentAsByteArray(), options); } else if (value instanceof BinaryDocument) { BinaryDocument doc = (BinaryDocument) value; if (doc.isStreamable()) { InputStream is = null; try { is = doc.getContentAsByteStream(); content = ContentFactory.newUnBufferedContent(uri, is, options); } catch (Exception ex) { if (is != null) { is.close(); } LOG.error("Error accessing large binary document " + key + ", skipping...", ex); } } else { content = ContentFactory.newContent(uri, doc.getContentAsByteArray(), options); } } else if (value instanceof BytesWritable) { if (formatNeeded) { options.setFormat(DocumentFormat.BINARY); formatNeeded = false; } content = ContentFactory.newContent(uri, ((BytesWritable) value).getBytes(), 0, ((BytesWritable) value).getLength(), options); } else if (value instanceof CustomContent) { ContentCreateOptions newOptions = options; if (batchSize > 1) { newOptions = (ContentCreateOptions) options.clone(); } content = ((CustomContent) value).getContent(conf, newOptions, uri); } else if (value instanceof DatabaseDocument) { DatabaseDocument 
doc = (DatabaseDocument) value; if (formatNeeded) { options.setFormat(doc.getContentType().getDocumentFormat()); formatNeeded = false; } options.setEncoding(DEFAULT_OUTPUT_CONTENT_ENCODING); if (doc.getContentType() == ContentType.BINARY) { content = ContentFactory.newContent(uri, doc.getContentAsByteArray(), options); } else { content = ContentFactory.newContent(uri, doc.getContentAsText().getBytes(), options); } } else if (value instanceof StreamLocator) { Path path = ((StreamLocator) value).getPath(); if (fs == null) { URI fileUri = path.toUri(); fs = FileSystem.get(fileUri, conf); } switch (((StreamLocator) value).getCodec()) { case GZIP: InputStream fileIn = fs.open(path); is = new GZIPInputStream(fileIn); break; case ZIP: if (is == null) { InputStream zipfileIn = fs.open(path); ZipInputStream zis = new ZipInputStream(zipfileIn); is = new ZipEntryInputStream(zis, path.toString()); } break; case NONE: is = fs.open(path); break; default: LOG.error("Unsupported compression codec: " + ((StreamLocator) value).getCodec() + " for document " + key); return content; } if (streaming) { content = ContentFactory.newUnBufferedContent(uri, is, options); } else { content = ContentFactory.newContent(uri, is, options); } } else { throw new UnsupportedOperationException(value.getClass() + " is not supported."); } return content; }
From source file:com.mellanox.r4h.CorruptFileBlockIterator.java
License:Apache License
/**
 * Converts a path to a string by taking the path component of its URI.
 *
 * @param path the path to convert
 * @return the path component of {@code path.toUri()}
 */
private String path2String(Path path) {
    final URI asUri = path.toUri();
    return asUri.getPath();
}
From source file:com.mellanox.r4h.DistributedFileSystem.java
License:Apache License
/** * Checks that the passed URI belongs to this filesystem and returns * just the path component. Expects a URI with an absolute path. * /* www. j a v a 2 s . c o m*/ * @param file * URI with absolute path * @return path component of {file} * @throws IllegalArgumentException * if URI does not belong to this DFS */ private String getPathName(Path file) { checkPath(file); String result = file.toUri().getPath(); if (!DFSUtil.isValidName(result)) { throw new IllegalArgumentException( "Pathname " + result + " from " + file + " is not a valid DFS filename."); } return result; }
From source file:com.modofo.molo.cluster.DisplayClustering.java
License:Apache License
/**
 * Writes every element of {@code SAMPLE_DATA} to a sequence file at the
 * given location, keyed {@code sample_0}, {@code sample_1}, and so on in
 * iteration order.
 *
 * <p>The writer is closed quietly, so a failure while closing is not
 * reported to the caller.
 *
 * @param output destination of the sequence file
 * @throws IOException if the file system cannot be obtained or a record
 *         cannot be appended
 */
protected static void writeSampleData(Path output) throws IOException {
    final Configuration hadoopConf = new Configuration();
    final FileSystem targetFs = FileSystem.get(output.toUri(), hadoopConf);
    final SequenceFile.Writer out =
            new SequenceFile.Writer(targetFs, hadoopConf, output, Text.class, VectorWritable.class);
    try {
        int index = 0;
        for (VectorWritable sample : SAMPLE_DATA) {
            Text sampleKey = new Text("sample_" + index);
            out.append(sampleKey, sample);
            index++;
        }
    } finally {
        Closeables.closeQuietly(out);
    }
}
From source file:com.moz.fiji.mapreduce.framework.MapReduceJobBuilder.java
License:Apache License
/** * Adds a directory of jars to the distributed cache of the job. * * @param jarDirectory The path to a directory of jar files. * Path may be qualified (eg. "hdfs://path/to/dir", or "file:/path/to/dir"), * or unqualified (eg. "path/to/dir"), in which case it will be resovled against the * local file system.//from w ww. j av a 2 s . c o m * @return This builder instance so you may chain configuration method calls. */ @SuppressWarnings("unchecked") public T addJarDirectory(Path jarDirectory) { Path jarDirPath = null; try { jarDirPath = (jarDirectory.toUri().getScheme() == null) ? new Path("file:" + jarDirectory) : jarDirectory; } catch (IllegalArgumentException iae) { // A URISyntaxException was thrown by the Path c'tor, wrapped in an // IllegalArgumentException. Meaning the Path is a relative path, not an absolute one, // and contains no scheme identifier. Canonicalize the filename and add the "file:" // scheme prefix to the canonizalized Path jarDirPath = new Path("file:" + new File(jarDirectory.toString()).getAbsolutePath()); } mJarDirectories.add(jarDirPath); return (T) this; }