Example usage for org.apache.hadoop.fs Path toUri

Introduction

This page collects example usages of org.apache.hadoop.fs.Path.toUri().

Prototype

public URI toUri() 

Document

Convert this Path to a URI.
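
A minimal sketch of the call, using a hypothetical qualified path: toUri() exposes the URI behind a Path, so its scheme, authority, and path components can be inspected separately.

import java.net.URI;
import org.apache.hadoop.fs.Path;

public class PathToUriDemo {
    public static void main(String[] args) {
        // Hypothetical path; any scheme and authority behave the same way.
        Path p = new Path("hdfs://namenode:8020/user/alice/data.txt");
        URI uri = p.toUri();
        System.out.println(uri.getScheme());    // hdfs
        System.out.println(uri.getAuthority()); // namenode:8020
        System.out.println(uri.getPath());      // /user/alice/data.txt
    }
}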

Usage

From source file: com.marklogic.contentpump.FileAndDirectoryInputFormat.java

License: Apache License

@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    List<InputSplit> splits = new ArrayList<InputSplit>();
    Configuration conf = job.getConfiguration();
    try {
        List<FileStatus> files = listStatus(job);

        long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
        long maxSize = getMaxSplitSize(job);
        for (FileStatus child : files) {
            Path path = child.getPath();
            FileSystem fs = path.getFileSystem(conf);
            // length is 0 for dir according to FSDirectory.java in 0.20
            // however, w/ Hadoop2, dir in local fs has non-zero length
            long length = child.getLen();
            BlockLocation[] blkLocations = null;
            if (!child.isDirectory() || !(fs instanceof DistributedFileSystem)) {
                blkLocations = fs.getFileBlockLocations(child, 0, length);
            } else if (length != 0) {
                throw new IOException("non-zero length directory on HDFS:" + path.toUri().toString());
            }

            if ((length != 0) && isSplitable(job, path)) {
                long blockSize = child.getBlockSize();
                long splitSize = computeSplitSize(blockSize, minSize, maxSize);

                long bytesRemaining = length;
                while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(new FileSplit(path, length - bytesRemaining, splitSize,
                            blkLocations[blkIndex].getHosts()));
                    bytesRemaining -= splitSize;
                }

                if (bytesRemaining != 0) {
                    splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining,
                            blkLocations[blkLocations.length - 1].getHosts()));
                }
            } else if (length != 0) {
                splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts()));
            } else {
                // Create empty hosts array for zero length files
                splits.add(new FileSplit(path, 0, length, new String[0]));
            }
        }
    } catch (InvalidInputException ex) {
        String inPath = conf.get(ConfigConstants.CONF_INPUT_DIRECTORY);
        String pattern = conf.get(ConfigConstants.CONF_INPUT_FILE_PATTERN, ".*");
        throw new IOException("No input files found with the specified input path " + inPath
                + " and input file pattern " + pattern, ex);
    }

    PathFilter jobFilter = getInputPathFilter(job);
    List<PathFilter> filters = new ArrayList<PathFilter>();
    filters.add(hiddenFileFilter);
    if (jobFilter != null) {
        filters.add(jobFilter);
    }
    PathFilter inputFilter = new MultiPathFilter(filters);
    // take a second pass of the splits generated to extract files from
    // directories
    int count = 0;
    // flatten directories until reaching SPLIT_COUNT_LIMIT
    while (count < splits.size() && splits.size() < SPLIT_COUNT_LIMIT) {
        FileSplit split = (FileSplit) splits.get(count);
        Path file = split.getPath();
        FileSystem fs = file.getFileSystem(conf);
        FileStatus status = fs.getFileStatus(file);
        if (status.isDirectory()) {
            FileStatus[] children = fs.listStatus(file, inputFilter);
            if (children.length + count < SPLIT_COUNT_LIMIT) {
                splits.remove(count);
                for (FileStatus stat : children) {
                    FileSplit child = new FileSplit(stat.getPath(), 0, stat.getLen(), null);
                    splits.add(child);
                }
            } else {
                count++;
            }
        } else {
            count++;
        }
    }
    return splits;
}

From source file: com.marklogic.contentpump.ImportRecordReader.java

License: Apache License

protected String makeURIFromPath(Path file) {
    // get path portion of the file
    return file.toUri().getPath();
}
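
The call above relies on toUri().getPath() dropping the scheme and authority of a qualified path, leaving only the path component to use as a document URI. A hedged sketch with a hypothetical path:

import org.apache.hadoop.fs.Path;

// Hypothetical qualified input file; scheme and authority are stripped,
// leaving only the path component.
Path file = new Path("hdfs://host:8020/data/docs/a.xml");
String docUri = file.toUri().getPath(); // "/data/docs/a.xml"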

From source file: com.marklogic.contentpump.ImportRecordReader.java

License: Apache License

protected String makeURIForZipEntry(Path zipFile, String val) {
    Path path = new Path(zipFile, val);
    return path.toUri().getPath();
}

From source file: com.marklogic.contentpump.OutputArchive.java

License: Apache License

private void newOutputStream() throws IOException {
    // use the constructor filename for the first zip,
    // then add filecount to subsequent archives, if any.
    int count = fileCount.getAndIncrement();
    currPath = newPackagePath(basePath, count, 6);
    if (outputStream != null) {
        if (LOG.isDebugEnabled()) {
            LOG.debug("closing output archive: " + currPath);
        }
        outputStream.flush();
        outputStream.close();
    }
    currentFileBytes = 0;
    currentEntries = 0;

    Path zpath = new Path(currPath);
    FileSystem fs = zpath.getFileSystem(conf);
    if (fs.exists(zpath)) {
        throw new IOException(zpath + " already exists.");
    }

    if (LOG.isDebugEnabled()) {
        LOG.debug("Creating output archive: " + zpath);
        LOG.debug("Default charset: " + Charset.defaultCharset());
    }
    // if fs instanceof DistributedFileSystem, use hadoop api; otherwise,
    // use java api
    if (fs instanceof DistributedFileSystem) {
        FSDataOutputStream fsout = fs.create(zpath, false);
        outputStream = new ZipOutputStream(fsout);
    } else {
        File f = new File(zpath.toUri().getPath());
        if (!f.exists()) {
            f.getParentFile().mkdirs();
            f.createNewFile();
        }
        FileOutputStream fos = new FileOutputStream(f, false);
        outputStream = new ZipOutputStream(fos);
    }

}

From source file: com.marklogic.contentpump.SingleDocumentWriter.java

License: Apache License

@Override
public void write(DocumentURI uri, MarkLogicDocument content) throws IOException, InterruptedException {
    OutputStream os = null;
    try {
        String childPath = URIUtil.getPathFromURI(uri);
        Path path;
        if (childPath.charAt(0) == '/') {
            // concatenate outputPath with path to form the path
            path = new Path(dir.toString() + childPath);
        } else {
            path = new Path(dir, childPath);
        }
        FileSystem fs = path.getFileSystem(conf);
        if (fs instanceof DistributedFileSystem) {
            os = fs.create(path, false);
        } else {
            File f = new File(path.toUri().getPath());
            if (!f.exists()) {
                f.getParentFile().mkdirs();
                f.createNewFile();
            }
            os = new FileOutputStream(f, false);
        }

        ContentType type = content.getContentType();
        if (ContentType.BINARY.equals(type)) {
            if (content.isStreamable()) {
                InputStream is = null;
                try {
                    is = content.getContentAsByteStream();
                    long size = content.getContentSize();
                    long bufSize = Math.min(size, 512 << 10);
                    byte[] buf = new byte[(int) bufSize];
                    for (long toRead = size, read = 0; toRead > 0; toRead -= read) {
                        read = is.read(buf, 0, (int) bufSize);
                        if (read > 0) {
                            os.write(buf, 0, (int) read);
                        } else {
                            LOG.error("Premature EOF: uri=" + uri + ",toRead=" + toRead);
                            break;
                        }
                    }
                } finally {
                    if (is != null) {
                        is.close();
                    }
                }
            } else {
                os.write(content.getContentAsByteArray());
            }
        } else if (ContentType.TEXT.equals(type) || ContentType.XML.equals(type)
                || ContentType.JSON.equals(type)) {
            if (encoding.equals("UTF-8")) {
                Text t = content.getContentAsText();
                os.write(t.getBytes(), 0, t.getLength());
            } else {
                String t = content.getContentAsString();
                os.write(t.getBytes(encoding));
            }
            if (LOG.isTraceEnabled()) {
                Text t = content.getContentAsText();
                LOG.trace(t);
                byte[] bytes = content.getContentAsByteArray();
                StringBuilder sb = new StringBuilder();
                for (int i = 0; i < bytes.length; i++) {
                    sb.append(Byte.toString(bytes[i]));
                    sb.append(" ");
                }
                LOG.trace(sb);
            }
        } else {
            LOG.error("Skipping " + uri + ".  Unsupported content type: " + type.name());
        }
    } catch (Exception e) {
        LOG.error("Error saving: " + uri, e);
    } finally {
        if (os != null) {
            os.close();
        }
    }
}

From source file: com.marklogic.mapreduce.ContentWriter.java

License: Apache License

protected Content createContent(DocumentURI key, VALUEOUT value) throws IOException {
    String uri = key.getUri();
    Content content = null;
    if (value instanceof Text) {
        if (formatNeeded) {
            options.setFormat(DocumentFormat.TEXT);
            formatNeeded = false;
        }
        options.setEncoding(DEFAULT_OUTPUT_CONTENT_ENCODING);
        content = ContentFactory.newContent(uri, ((Text) value).getBytes(), 0, ((Text) value).getLength(),
                options);
    } else if (value instanceof MarkLogicNode) {
        if (formatNeeded) {
            options.setFormat(DocumentFormat.XML);
            formatNeeded = false;
        }
        content = ContentFactory.newContent(uri, ((MarkLogicNode) value).get(), options);
    } else if (value instanceof DOMDocument) {
        content = ContentFactory.newContent(uri, ((DOMDocument) value).getContentAsMarkLogicNode().get(),
                options);
    } else if (value instanceof JSONDocument) {
        JSONDocument doc = (JSONDocument) value;
        content = ContentFactory.newContent(uri, doc.getContentAsByteArray(), options);
    } else if (value instanceof BinaryDocument) {
        BinaryDocument doc = (BinaryDocument) value;
        if (doc.isStreamable()) {
            InputStream is = null;
            try {
                is = doc.getContentAsByteStream();
                content = ContentFactory.newUnBufferedContent(uri, is, options);
            } catch (Exception ex) {
                if (is != null) {
                    is.close();
                }
                LOG.error("Error accessing large binary document " + key + ", skipping...", ex);
            }
        } else {
            content = ContentFactory.newContent(uri, doc.getContentAsByteArray(), options);
        }
    } else if (value instanceof BytesWritable) {
        if (formatNeeded) {
            options.setFormat(DocumentFormat.BINARY);
            formatNeeded = false;
        }
        content = ContentFactory.newContent(uri, ((BytesWritable) value).getBytes(), 0,
                ((BytesWritable) value).getLength(), options);
    } else if (value instanceof CustomContent) {
        ContentCreateOptions newOptions = options;
        if (batchSize > 1) {
            newOptions = (ContentCreateOptions) options.clone();
        }
        content = ((CustomContent) value).getContent(conf, newOptions, uri);
    } else if (value instanceof DatabaseDocument) {
        DatabaseDocument doc = (DatabaseDocument) value;
        if (formatNeeded) {
            options.setFormat(doc.getContentType().getDocumentFormat());
            formatNeeded = false;
        }
        options.setEncoding(DEFAULT_OUTPUT_CONTENT_ENCODING);
        if (doc.getContentType() == ContentType.BINARY) {
            content = ContentFactory.newContent(uri, doc.getContentAsByteArray(), options);
        } else {
            content = ContentFactory.newContent(uri, doc.getContentAsText().getBytes(), options);
        }
    } else if (value instanceof StreamLocator) {
        Path path = ((StreamLocator) value).getPath();
        if (fs == null) {
            URI fileUri = path.toUri();
            fs = FileSystem.get(fileUri, conf);
        }
        switch (((StreamLocator) value).getCodec()) {
        case GZIP:
            InputStream fileIn = fs.open(path);
            is = new GZIPInputStream(fileIn);
            break;
        case ZIP:
            if (is == null) {
                InputStream zipfileIn = fs.open(path);
                ZipInputStream zis = new ZipInputStream(zipfileIn);
                is = new ZipEntryInputStream(zis, path.toString());
            }
            break;
        case NONE:
            is = fs.open(path);
            break;
        default:
            LOG.error("Unsupported compression codec: " + ((StreamLocator) value).getCodec() + " for document "
                    + key);
            return content;
        }
        if (streaming) {
            content = ContentFactory.newUnBufferedContent(uri, is, options);
        } else {
            content = ContentFactory.newContent(uri, is, options);
        }

    } else {
        throw new UnsupportedOperationException(value.getClass() + " is not supported.");
    }
    return content;
}

From source file: com.mellanox.r4h.CorruptFileBlockIterator.java

License: Apache License

private String path2String(Path path) {
    return path.toUri().getPath();
}

From source file: com.mellanox.r4h.DistributedFileSystem.java

License: Apache License

/**
 * Checks that the passed URI belongs to this filesystem and returns
 * just the path component. Expects a URI with an absolute path.
 *
 * @param file
 *            URI with absolute path
 * @return path component of {file}
 * @throws IllegalArgumentException
 *             if URI does not belong to this DFS
 */
private String getPathName(Path file) {
    checkPath(file);
    String result = file.toUri().getPath();
    if (!DFSUtil.isValidName(result)) {
        throw new IllegalArgumentException(
                "Pathname " + result + " from " + file + " is not a valid DFS filename.");
    }
    return result;
}

From source file: com.modofo.molo.cluster.DisplayClustering.java

License: Apache License

protected static void writeSampleData(Path output) throws IOException {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(output.toUri(), conf);
    SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, output, Text.class, VectorWritable.class);
    try {
        int i = 0;
        for (VectorWritable vw : SAMPLE_DATA) {
            writer.append(new Text("sample_" + i++), vw);
        }
    } finally {
        Closeables.closeQuietly(writer);
    }
}
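
Note the pattern FileSystem.get(output.toUri(), conf): the path's URI selects the filesystem that actually owns the path (HDFS, local, and so on) rather than the configured default filesystem. A minimal sketch, with a hypothetical path:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Hypothetical output location on HDFS.
Path output = new Path("hdfs://namenode:8020/tmp/samples.seq");
Configuration conf = new Configuration();
// Resolves the HDFS filesystem because the URI carries an hdfs:// scheme.
FileSystem fs = FileSystem.get(output.toUri(), conf);

Path.getFileSystem(conf), used in several of the other examples on this page, is shorthand for the same call.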

From source file: com.moz.fiji.mapreduce.framework.MapReduceJobBuilder.java

License: Apache License

/**
 * Adds a directory of jars to the distributed cache of the job.
 *
 * @param jarDirectory The path to a directory of jar files.
 *     Path may be qualified (e.g. "hdfs://path/to/dir" or "file:/path/to/dir"),
 *     or unqualified (e.g. "path/to/dir"), in which case it will be resolved
 *     against the local file system.
 * @return This builder instance so you may chain configuration method calls.
 */
@SuppressWarnings("unchecked")
public T addJarDirectory(Path jarDirectory) {
    Path jarDirPath = null;
    try {
        jarDirPath = (jarDirectory.toUri().getScheme() == null) ? new Path("file:" + jarDirectory)
                : jarDirectory;
    } catch (IllegalArgumentException iae) {
        // A URISyntaxException was thrown by the Path c'tor, wrapped in an
        // IllegalArgumentException.  Meaning the Path is a relative path, not an absolute one,
        // and contains no scheme identifier. Canonicalize the filename and add the "file:"
        // scheme prefix to the canonizalized Path
        jarDirPath = new Path("file:" + new File(jarDirectory.toString()).getAbsolutePath());
    }

    mJarDirectories.add(jarDirPath);
    return (T) this;
}
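
The scheme check above works because toUri().getScheme() returns null for an unqualified path. A short sketch with hypothetical paths:

import org.apache.hadoop.fs.Path;

// Qualified path: the scheme is "hdfs".
Path qualified = new Path("hdfs://namenode:8020/lib/jars");
System.out.println(qualified.toUri().getScheme()); // hdfs

// Unqualified path: no scheme, so getScheme() returns null and the
// builder prepends "file:" to anchor it to the local filesystem.
Path unqualified = new Path("lib/jars");
System.out.println(unqualified.toUri().getScheme()); // null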