Example usage for org.apache.hadoop.fs FileSystem open

Introduction

On this page you can find example usages of the open method of org.apache.hadoop.fs.FileSystem.

Prototype

public FSDataInputStream open(PathHandle fd) throws IOException

Source Link

Document

Open an FSDataInputStream matching the PathHandle instance.

Usage

From source file:com.marklogic.contentpump.DelimitedTextInputFormat.java

License:Apache License

/**
 * Generates the input splits for the job and, when input splitting is
 * enabled, wraps each one in a {@code DelimitedSplit} that carries the
 * header row of the file it belongs to.
 *
 * <p>The header is parsed only from a split that starts at offset 0; every
 * following split reuses the most recently parsed header.
 * NOTE(review): this presumes the splits returned by the superclass are
 * grouped per file with the offset-0 split first -- confirm.
 *
 * @param job the job context supplying the configuration
 * @return the plain splits when input splitting is off or the split count
 *         reaches {@code SPLIT_COUNT_LIMIT}; otherwise the header-populated
 *         splits
 * @throws IOException if a file cannot be read or a header record reports
 *         a size that does not match its contents
 */
public List<InputSplit> getSplits(JobContext job) throws IOException {
    boolean delimSplit = isSplitInput(job.getConfiguration());
    // If delimSplit is true, the size of each split is determined by
    // Math.max(minSize, Math.min(maxSize, blockSize)) in FileInputFormat.
    List<InputSplit> splits = super.getSplits(job);
    if (!delimSplit) {
        return splits;
    }

    if (splits.size() >= SPLIT_COUNT_LIMIT) {
        // At or above the limit there is already enough parallelism, so
        // input splitting is switched off in the configuration.
        LOG.warn("Exceeding SPLIT_COUNT_LIMIT, input_split is off:" + SPLIT_COUNT_LIMIT);
        DefaultStringifier.store(job.getConfiguration(), false, ConfigConstants.CONF_SPLIT_INPUT);
        return splits;
    }
    // add header info into splits
    List<InputSplit> populatedSplits = new ArrayList<InputSplit>();
    LOG.info(splits.size() + " DelimitedSplits generated");
    Configuration conf = job.getConfiguration();
    char delimiter = 0;
    ArrayList<Text> hlist = new ArrayList<Text>();
    for (InputSplit file : splits) {
        FileSplit fsplit = ((FileSplit) file);
        Path path = fsplit.getPath();
        FileSystem fs = path.getFileSystem(conf);

        if (fsplit.getStart() == 0) {
            // First split of a file: parse the header row from it.
            FSDataInputStream fileIn = fs.open(path);
            try {
                String delimStr = conf.get(ConfigConstants.CONF_DELIMITER, ConfigConstants.DEFAULT_DELIMITER);
                if (delimStr.length() == 1) {
                    delimiter = delimStr.charAt(0);
                } else {
                    // Report the configured value that was rejected, not the
                    // (possibly still unset) delimiter character.
                    LOG.error("Incorrect delimitor: " + delimStr + ". Expects single character.");
                }
                String encoding = conf.get(MarkLogicConstants.OUTPUT_CONTENT_ENCODING,
                        MarkLogicConstants.DEFAULT_OUTPUT_CONTENT_ENCODING);
                InputStreamReader instream = new InputStreamReader(fileIn, encoding);
                CSVParser parser = new CSVParser(instream,
                        CSVParserFormatter.getFormat(delimiter, DelimitedTextReader.encapsulator, true, true));
                Iterator<CSVRecord> it = parser.iterator();

                if (it.hasNext()) {
                    CSVRecord record = it.next();
                    Iterator<String> recordIterator = record.iterator();
                    int recordSize = record.size();
                    String[] header = new String[recordSize];
                    for (int i = 0; i < recordSize; i++) {
                        if (recordIterator.hasNext()) {
                            header[i] = recordIterator.next();
                        } else {
                            throw new IOException("Record size doesn't match the real size");
                        }
                    }

                    EncodingUtil.handleBOMUTF8(header, 0);

                    // Replace the header carried over from the previous file.
                    hlist.clear();
                    for (String s : header) {
                        hlist.add(new Text(s));
                    }
                }
            } finally {
                // Closing the underlying stream releases the file handle even
                // when parsing or decoding fails part-way through.
                fileIn.close();
            }
        }

        DelimitedSplit ds = new DelimitedSplit(new TextArrayWritable(hlist.toArray(new Text[hlist.size()])),
                path, fsplit.getStart(), fsplit.getLength(), fsplit.getLocations());
        populatedSplits.add(ds);
    }

    return populatedSplits;
}

From source file:com.marklogic.mapreduce.examples.ContentLoader.java

License:Apache License

/**
 * Loads the whole file behind the split into memory as a single record:
 * the key is set to the file path and the value to the file's bytes.
 *
 * <p>If reading fails, {@code hasNext} is set to {@code false} and no
 * exception is propagated from the read itself.
 *
 * @param inSplit the split to read; it is cast to {@code FileSplit}
 * @param context the task context supplying the configuration
 * @throws IOException if the file cannot be opened or closed, or if the
 *         split is too large to fit in a single byte array
 * @throws InterruptedException declared by the overridden method
 */
@Override
public void initialize(InputSplit inSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    bytesTotal = inSplit.getLength();
    long splitLength = inSplit.getLength();
    if (splitLength > Integer.MAX_VALUE) {
        // A plain (int) cast would wrap around and silently load a truncated
        // document (or fail with NegativeArraySizeException).
        throw new IOException("Split too large to load into memory: " + splitLength + " bytes");
    }
    Path file = ((FileSplit) inSplit).getPath();
    FileSystem fs = file.getFileSystem(context.getConfiguration());
    // Allocate before opening so a failed allocation cannot leak the stream.
    byte[] buf = new byte[(int) splitLength];
    FSDataInputStream fileIn = fs.open(file);
    key.set(file.toString());
    try {
        fileIn.readFully(buf);
        value.set(buf);
        hasNext = true;
    } catch (Exception e) {
        // Best effort: a file that cannot be read yields no record.
        hasNext = false;
    } finally {
        fileIn.close();
    }
}

From source file:com.marklogic.mapreduce.examples.LinkCountHDFS.java

License:Apache License

/**
 * Parses the file behind the split as XML, evaluates
 * {@code PATH_EXPRESSION} against it (with the "wp" prefix bound to the
 * MediaWiki export namespace) and stores the matching items in
 * {@code items}.
 *
 * <p>A {@code SAXException} is rethrown as an {@code IOException}; a
 * {@code SaxonApiException} is only printed, leaving {@code items} in
 * whatever state it had reached.
 *
 * @param inSplit the split to read; it is cast to {@code FileSplit}
 * @param context the task context supplying the configuration
 * @throws IOException if the file cannot be opened, read, parsed or closed
 * @throws InterruptedException declared by the overridden method
 */
@Override
public void initialize(InputSplit inSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    Path wikiFile = ((FileSplit) inSplit).getPath();
    FSDataInputStream input = wikiFile.getFileSystem(context.getConfiguration()).open(wikiFile);
    DocumentBuilder domBuilder = builderLocal.get();
    try {
        // Build a DOM first, then hand it to Saxon for XPath evaluation.
        Document dom = domBuilder.parse(input);
        XdmNode wrapped = saxonBuilderLocal.get().wrap(dom);

        XPathCompiler compiler = proc.newXPathCompiler();
        compiler.declareNamespace("wp", "http://www.mediawiki.org/xml/export-0.4/");
        XPathSelector matches = compiler.compile(PATH_EXPRESSION).load();
        matches.setContextItem(wrapped);

        items = new ArrayList<XdmItem>();
        for (XdmItem match : matches) {
            items.add(match);
        }
    } catch (SAXException parseFailure) {
        parseFailure.printStackTrace();
        throw new IOException(parseFailure);
    } catch (SaxonApiException xpathFailure) {
        xpathFailure.printStackTrace();
    } finally {
        if (input != null) {
            input.close();
        }
    }
}

From source file:com.marklogic.mapreduce.examples.WikiLoader.java

License:Apache License

/**
 * Reads the text of the wiki pages that start inside the split and passes
 * it to {@code WikiModelProcessor.process}, storing the result in
 * {@code articles}.
 *
 * <p>Text is collected from the first {@code BEGIN_PAGE_TAG} found in the
 * split. Once the split's byte range is exhausted, the method either cuts
 * at the last {@code END_PAGE_TAG} of the final chunk or keeps reading
 * ahead, {@code READ_AHEAD_SIZE} bytes at a time, until the next
 * {@code END_PAGE_TAG}.
 *
 * <p>NOTE(review): bytes are decoded with the platform default charset and
 * tags that straddle two reads are not detected; both are unchanged from
 * the previous behaviour.
 *
 * @param inSplit the split to read; it is cast to {@code FileSplit}
 * @param context the task context supplying the configuration
 * @throws IOException if the file cannot be opened, read or closed
 * @throws InterruptedException if querying the split is interrupted
 */
@Override
public void initialize(InputSplit inSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    Path file = ((FileSplit) inSplit).getPath();
    FileSystem fs = file.getFileSystem(context.getConfiguration());
    long bytesTotal = inSplit.getLength();
    long start = ((FileSplit) inSplit).getStart();
    StringBuilder pages = new StringBuilder();
    FSDataInputStream fileIn = fs.open(file);
    try {
        byte[] buf = new byte[BUFFER_SIZE];
        fileIn.seek(start);
        long bytesRead = 0;
        int sindex = -1;
        while (true) {
            int length = (int) Math.min(bytesTotal - bytesRead, buf.length);
            int read = fileIn.read(buf, 0, length);
            if (read == -1) {
                System.out.println("Unexpected EOF: bytesTotal=" + bytesTotal + "bytesRead=" + bytesRead);
                break;
            }
            bytesRead += read;
            String temp = new String(buf, 0, read);
            if (sindex == -1) { // haven't found the start yet
                sindex = temp.indexOf(BEGIN_PAGE_TAG);
                if (sindex > -1) {
                    pages.append(temp.substring(sindex));
                } else if (bytesRead >= bytesTotal) {
                    // No page starts in this split. Without this exit the
                    // loop would keep issuing zero-length reads forever.
                    break;
                }
            } else if (bytesRead < bytesTotal) { // haven't completed the split
                pages.append(temp);
            } else { // reached the end of this split
                // look for end
                int eindex = 0;
                if (temp.contains(END_DOC_TAG) || // reached the end of doc
                        temp.endsWith(END_PAGE_TAG)) {
                    eindex = temp.lastIndexOf(END_PAGE_TAG);
                    pages.append(temp.substring(0, eindex + END_PAGE_TAG.length()));
                    System.out.println("Found end of doc.");
                } else { // need to read ahead to look for end of page
                    while (true) {
                        read = fileIn.read(buf, 0, READ_AHEAD_SIZE);
                        if (read == -1) { // no more to read
                            System.out
                                    .println("Unexpected EOF: bytesTotal=" + bytesTotal + "bytesRead=" + bytesRead);
                            System.out.println(temp);
                            break;
                        }
                        bytesRead += read;
                        // look for end
                        temp = new String(buf, 0, read);
                        eindex = temp.indexOf(END_PAGE_TAG);
                        if (eindex > -1) {
                            pages.append(temp.substring(0, eindex + END_PAGE_TAG.length()));
                            break;
                        } else {
                            pages.append(temp);
                        }
                    }
                }
                break;
            }
        }
    } finally {
        // Release the file handle even if a read fails.
        fileIn.close();
    }
    articles = WikiModelProcessor.process(pages);
}

From source file:com.marklogic.mapreduce.examples.ZipContentLoader.java

License:Apache License

/**
 * Opens the file behind the split and wraps it in a {@code ZipInputStream}
 * stored in {@code zipIn}. The stream is left open for later use.
 *
 * @param inSplit the split to read; it is cast to {@code FileSplit}
 * @param context the task context supplying the configuration
 * @throws IOException if the file system or the file cannot be opened
 * @throws InterruptedException declared by the overridden method
 */
@Override
public void initialize(InputSplit inSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    Path archive = ((FileSplit) inSplit).getPath();
    FileSystem archiveFs = archive.getFileSystem(context.getConfiguration());
    zipIn = new ZipInputStream(archiveFs.open(archive));
}

From source file:com.marklogic.mapreduce.ForestInputFormat.java

License:Apache License

/**
 * Builds input splits over the TreeData file of each stand directory,
 * using the offsets recorded in the stand's TreeIndex file as the only
 * permitted split boundaries.
 *
 * <p>Each TreeIndex entry is read as an int docid, an int that is
 * discarded, and a long offset. Docids and offsets must be strictly
 * increasing (compared as unsigned values), and no offset may exceed the
 * TreeData size. A split is closed at whichever of the current or the
 * previous offset lands closer to the target split size.
 *
 * @param job the job context supplying the configuration and input paths
 * @return the splits over the TreeData files
 * @throws IOException if a stand directory or its files cannot be read
 * @throws RuntimeException if a required stand file is missing or the
 *         TreeIndex content is inconsistent
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);
    for (FileStatus file : files) { // stand directories
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job.getConfiguration());
        FileStatus children[] = fs.listStatus(path);
        FileStatus treeIndexStatus = null, treeDataStatus = null, ordinalsStatus = null,
                timestampsStatus = null;
        boolean obsolete = false;
        for (FileStatus child : children) {
            String fileName = child.getPath().getName();
            if (fileName.equals("TreeData")) { // inside a stand
                treeDataStatus = child;
            } else if (fileName.equals("TreeIndex")) {
                treeIndexStatus = child;
            } else if (fileName.equals("Ordinals")) {
                ordinalsStatus = child;
            } else if (fileName.equals("Timestamps")) {
                timestampsStatus = child;
            } else if (fileName.equals("Obsolete")) {
                obsolete = true;
                break;
            }
        }
        if (obsolete) {
            LOG.warn("Obsolete file found.  The forest is either live or isn't "
                    + "dismounted cleanly.  Ignoring stand " + path);
            // NOTE(review): this stops processing ALL remaining stands, while
            // the message only mentions ignoring this one -- confirm whether
            // 'continue' was intended.
            break;
        }
        if (treeDataStatus == null) {
            throw new RuntimeException("TreeData file not found.");
        } else if (treeIndexStatus == null) {
            throw new RuntimeException("TreeIndex file not found.");
        } else if (ordinalsStatus == null) {
            throw new RuntimeException("Ordinals file not found.");
        } else if (timestampsStatus == null) {
            throw new RuntimeException("Timestamps file not found.");
        }
        long treeDataSize = treeDataStatus.getLen();
        if (treeDataSize == 0) {
            // unexpected, give up this stand
            LOG.warn("Found empty TreeData file.  Skipping...");
            continue; // skipping this stand
        }
        Path treeDataPath = treeDataStatus.getPath();
        long blockSize = treeDataStatus.getBlockSize();
        long splitSize = computeSplitSize(blockSize, minSize, maxSize);
        // Look up block locations before opening the index so that a failure
        // here cannot leak an open stream.
        BlockLocation[] blkLocations = fs.getFileBlockLocations(treeDataStatus, 0, treeDataSize);
        // make splits based on TreeIndex
        FSDataInputStream is = fs.open(treeIndexStatus.getPath());
        BiendianDataInputStream in = new BiendianDataInputStream(is);
        int prevDocid = -1, docid = -1, position = 0;
        long prevOffset = -1L, offset = 0, splitStart = 0;
        try {
            for (;; ++position) {
                try {
                    docid = in.readInt();
                    in.readInt(); // second field of the entry is not used here
                    offset = in.readLong();
                } catch (EOFException e) {
                    // End of the index: no more entries.
                    break;
                }
                int comp = InternalUtilities.compareUnsignedLong(offset, treeDataSize);
                if (comp > 0) {
                    throw new RuntimeException("TreeIndex offset is out of bound: position = " + position
                            + ", offset = " + offset + ", treeDataSize = " + treeDataSize);
                }
                // Mask to compare docids as unsigned 32-bit values.
                if (prevDocid != -1 && (docid & 0xffffffffL) <= (prevDocid & 0xffffffffL)) {
                    throw new RuntimeException("docid out of order, position = " + position + ", docid = "
                            + docid + ", prevDocid = " + prevDocid);
                }
                prevDocid = docid;
                if (prevOffset != -1L && InternalUtilities.compareUnsignedLong(offset, prevOffset) <= 0) {
                    throw new RuntimeException("offset out of order, position = " + position + ", offset = "
                            + offset + ", prevOffset = " + prevOffset);
                }
                long splitLen = offset - splitStart;
                if (splitLen == splitSize || (splitLen > splitSize
                        && splitLen - splitSize <= splitSize - (prevOffset - splitStart))) {
                    // Ending at the current offset is at least as close to the
                    // target size as ending at the previous one.
                    int blkIndex = getBlockIndex(blkLocations, offset);
                    InputSplit split = new FileSplit(treeDataPath, splitStart, splitLen,
                            blkLocations[blkIndex].getHosts());
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Created split: start=" + splitStart + " len=" + splitLen + " last docid="
                                + docid);
                    }
                    splits.add(split);
                    splitStart = offset;
                } else if (splitLen > splitSize) {
                    // Overshot the target: end the split at the previous offset.
                    int blkIndex = getBlockIndex(blkLocations, prevOffset);
                    InputSplit split = new FileSplit(treeDataPath, splitStart, prevOffset - splitStart,
                            blkLocations[blkIndex].getHosts());
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Created split: start=" + splitStart + " len=" + (prevOffset - splitStart)
                                + " last docid=" + docid);
                    }
                    splits.add(split);
                    splitStart = prevOffset;
                }
                // Remember this entry's offset. Without this the ordering
                // check above never fires and the "previous offset" branch
                // would work with the initial -1 value.
                prevOffset = offset;
            }
        } finally {
            in.close();
        }
        if (offset > splitStart) {
            // Emit the tail that did not reach a split boundary.
            int blkIndex = getBlockIndex(blkLocations, offset - 1);
            InputSplit split = new FileSplit(treeDataPath, splitStart, offset - splitStart,
                    blkLocations[blkIndex].getHosts());
            if (LOG.isDebugEnabled()) {
                LOG.debug("Created split: start=" + splitStart + " len=" + (offset - splitStart)
                        + " last docid=" + docid);
            }

            splits.add(split);
        }
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug("Made " + splits.size() + " splits.");
    }

    return splits;
}

From source file:com.marklogic.mapreduce.ForestReader.java

License:Apache License

/**
 * Prepares the reader for one split: opens the split's data file and
 * skips to the split start, opens the sibling "Ordinals" and "Timestamps"
 * files, validates the configured value class, and loads the collection,
 * directory and type filters from the configuration.
 *
 * <p>Directory filters that do not end with "/" are replaced by the same
 * value with a trailing "/" appended.
 *
 * @param split the split to read; it is cast to {@code FileSplit}
 * @param context the task context supplying the configuration
 * @throws IOException if any of the files cannot be opened or skipped
 * @throws InterruptedException declared by the overridden method
 * @throws IllegalArgumentException if the configured value class is not
 *         assignable to {@code ForestDocument}
 */
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
    this.split = (FileSplit) split;
    conf = context.getConfiguration();

    Path dataPath = this.split.getPath();
    Path standDir = dataPath.getParent();
    FileSystem fs = dataPath.getFileSystem(conf);

    // Open the data file first and position it at the start of this split.
    dataIs = new BiendianDataInputStream(fs.open(dataPath));
    dataIs.skipBytes(this.split.getStart());
    ordIs = new BiendianDataInputStream(fs.open(new Path(standDir, "Ordinals")));
    tsIs = new BiendianDataInputStream(fs.open(new Path(standDir, "Timestamps")));

    valueClass = conf.getClass(INPUT_VALUE_CLASS, ForestDocument.class, Writable.class);
    boolean supported = ForestDocument.class.isAssignableFrom(valueClass);
    if (!supported) {
        throw new IllegalArgumentException("Unsupported " + INPUT_VALUE_CLASS);
    }

    largeForestDir = new Path(standDir.getParent(), "Large");
    colFilters = conf.getStringCollection(COLLECTION_FILTER);
    dirFilters = conf.getStringCollection(DIRECTORY_FILTER);

    // Collect replacements separately; they are added only after iteration
    // so the collection is not grown while it is being walked.
    Collection<String> withSlash = new ArrayList<String>();
    Iterator<String> cursor = dirFilters.iterator();
    while (cursor.hasNext()) {
        String dir = cursor.next();
        if (dir.endsWith("/")) {
            continue;
        }
        cursor.remove();
        withSlash.add(dir + "/");
    }
    if (!withSlash.isEmpty()) {
        dirFilters.addAll(withSlash);
    }

    typeFilters = conf.getStringCollection(TYPE_FILTER);
}

From source file:com.marklogic.mapreduce.LargeBinaryDocument.java

License:Apache License

/**
 * Reads {@code len} bytes of the binary file at {@code path}, starting at
 * byte {@code offset}.
 *
 * @param offset position in the file of the first byte to return
 * @param len number of bytes to read
 * @return a new array of exactly {@code len} bytes
 * @throws RuntimeException if the file does not exist, if {@code offset}
 *         lies beyond the end of the file, if the file ends before
 *         {@code len} bytes could be read, or if an I/O error occurs (the
 *         {@code IOException} is attached as the cause)
 */
public byte[] getContentAsByteArray(int offset, int len) {
    FileSystem fs;
    FSDataInputStream is = null;
    try {
        fs = path.getFileSystem(conf);
        if (!fs.exists(path)) {
            throw new RuntimeException("File not found: " + path);
        }
        FileStatus status = fs.getFileStatus(path);
        if (status.getLen() < offset) {
            throw new RuntimeException("Reached end of file: " + path);
        }
        byte[] buf = new byte[len];
        is = fs.open(path);
        // Position the stream at the requested offset. The previous skip loop
        // never ran (its condition was false on entry), so data was always
        // read from the beginning of the file.
        is.seek(offset);
        int bytesRead = 0;
        while (bytesRead < len) {
            int n = is.read(buf, bytesRead, len - bytesRead);
            if (n < 0) {
                // read() signals end of stream with -1; adding that to the
                // running count would corrupt it.
                throw new RuntimeException("Reached end of file: " + path);
            }
            bytesRead += n;
        }
        return buf;
    } catch (IOException e) {
        throw new RuntimeException("Error accessing file: " + path, e);
    } finally {
        if (is != null) {
            try {
                is.close();
            } catch (IOException ignored) {
                // Nothing useful can be done if closing a read-only stream fails.
            }
        }
    }
}

From source file:com.marklogic.mapreduce.LargeBinaryDocument.java

License:Apache License

/**
 * Opens the binary file at {@code path} and returns the stream. The
 * stream is not closed here; that is left to the caller.
 *
 * @return an open input stream over the file
 * @throws RuntimeException if the file does not exist or an I/O error
 *         occurs (the {@code IOException} is attached as the cause)
 */
@Override
public InputStream getContentAsByteStream() {
    try {
        FileSystem fileSystem = path.getFileSystem(conf);
        if (fileSystem.exists(path)) {
            FSDataInputStream content = fileSystem.open(path);
            return content;
        }
        throw new RuntimeException("File not found: " + path);
    } catch (IOException ioFailure) {
        throw new RuntimeException("Error accessing file: " + path, ioFailure);
    }
}

From source file:com.marklogic.mapreduce.test.FCheck.java

License:Apache License

/**
 * Validates the TreeIndex file of the stand directory {@code dir} against
 * the size of the TreeData file in the same directory.
 *
 * <p>Each entry is read as an int docid, an int that is discarded, and a
 * long offset. {@code panic} is called when an offset is not below the
 * TreeData size, or when docids or offsets are not strictly increasing
 * (compared as unsigned values). An empty TreeData file is reported on
 * standard error and the check is skipped.
 *
 * @param dir the stand directory containing TreeIndex and TreeData
 * @throws IOException if the directory or the index cannot be read
 * @throws RuntimeException if TreeData or TreeIndex is missing
 */
public void checkTreeIndex(File dir) throws IOException {
    File file = new File(dir, "TreeIndex");
    if (verbose) {
        System.out.println(file.getAbsolutePath() + " -> checkTreeIndex");
    }
    Path path = new Path(dir.getAbsolutePath());
    FileSystem fs = path.getFileSystem(new Configuration());
    FileStatus children[] = fs.listStatus(path);
    FileStatus treeIndexStatus = null, treeDataStatus = null;
    for (FileStatus child : children) {
        String fileName = child.getPath().getName();
        if (fileName.equals("TreeData")) { // inside a stand
            treeDataStatus = child;
        } else if (fileName.equals("TreeIndex")) {
            treeIndexStatus = child;
        }
        if (treeDataStatus != null && treeIndexStatus != null) {
            // Both files located; no need to look at the rest.
            break;
        }
    }
    if (treeDataStatus == null) {
        throw new RuntimeException("TreeData file not found.");
    } else if (treeIndexStatus == null) {
        throw new RuntimeException("TreeIndex file not found.");
    }
    long treeDataSize = treeDataStatus.getLen();
    if (treeDataSize == 0) {
        // unexpected, give up this stand
        System.err.println("Found empty TreeData file.  Skipping...");
        return;
    }
    FSDataInputStream is = fs.open(treeIndexStatus.getPath());
    BiendianDataInputStream in = new BiendianDataInputStream(is);
    int position = 0;
    try {
        in.setLittleEndian(littleEndian);
        int prevDocid = -1;
        long prevOffset = -1L;
        int docid;
        long offset;
        for (;; ++position) {
            try {
                docid = in.readInt();
                in.readInt(); // second field of the entry is not checked here
                offset = in.readLong();
            } catch (EOFException e) {
                // End of the index: no more entries.
                break;
            }
            if (debug) {
                System.out.println(String.format("TreeIndex p %08x d %08x o %016x", position, docid, offset));
            }
            if (compareUnsignedLong(offset, treeDataSize) >= 0) {
                panic(file, String.format("offset out of range, position=%d, offset=%d, treeDataSize=%d", position,
                        offset, treeDataSize));
            }
            // Mask to compare docids as unsigned 32-bit values.
            if (prevDocid != -1 && (docid & 0xffffffffL) <= (prevDocid & 0xffffffffL)) {
                panic(file, String.format("docid out of order, position=%d, docid=%d, prevDocid=%d", position,
                        docid, prevDocid));
            }
            prevDocid = docid;
            if (prevOffset != -1L && compareUnsignedLong(offset, prevOffset) <= 0) {
                panic(file, String.format("offset out of order, position=%d, offset=%d, prevOffset=%d", position,
                        offset, prevOffset));
            }
            prevOffset = offset;
        }
    } finally {
        // The index stream was previously never closed.
        in.close();
    }
    if (verbose) {
        System.out.println(file.getAbsolutePath() + " <- checkTreeIndex [" + position + "]");
    }
}