List of usage examples for org.apache.hadoop.fs FileSystem open
public FSDataInputStream open(PathHandle fd) throws IOException
From source file:com.marklogic.contentpump.DelimitedTextInputFormat.java
License:Apache License
public List<InputSplit> getSplits(JobContext job) throws IOException { boolean delimSplit = isSplitInput(job.getConfiguration()); //if delimSplit is true, size of each split is determined by //Math.max(minSize, Math.min(maxSize, blockSize)) in FileInputFormat List<InputSplit> splits = super.getSplits(job); if (!delimSplit) { return splits; }/*from w ww. j a v a2s.c om*/ if (splits.size() >= SPLIT_COUNT_LIMIT) { //if #splits > 1 million, there is enough parallelism //therefore no point to split LOG.warn("Exceeding SPLIT_COUNT_LIMIT, input_split is off:" + SPLIT_COUNT_LIMIT); DefaultStringifier.store(job.getConfiguration(), false, ConfigConstants.CONF_SPLIT_INPUT); return splits; } // add header info into splits List<InputSplit> populatedSplits = new ArrayList<InputSplit>(); LOG.info(splits.size() + " DelimitedSplits generated"); Configuration conf = job.getConfiguration(); char delimiter = 0; ArrayList<Text> hlist = new ArrayList<Text>(); for (InputSplit file : splits) { FileSplit fsplit = ((FileSplit) file); Path path = fsplit.getPath(); FileSystem fs = path.getFileSystem(conf); if (fsplit.getStart() == 0) { // parse the inSplit, get the header FSDataInputStream fileIn = fs.open(path); String delimStr = conf.get(ConfigConstants.CONF_DELIMITER, ConfigConstants.DEFAULT_DELIMITER); if (delimStr.length() == 1) { delimiter = delimStr.charAt(0); } else { LOG.error("Incorrect delimitor: " + delimiter + ". 
Expects single character."); } String encoding = conf.get(MarkLogicConstants.OUTPUT_CONTENT_ENCODING, MarkLogicConstants.DEFAULT_OUTPUT_CONTENT_ENCODING); InputStreamReader instream = new InputStreamReader(fileIn, encoding); CSVParser parser = new CSVParser(instream, CSVParserFormatter.getFormat(delimiter, DelimitedTextReader.encapsulator, true, true)); Iterator<CSVRecord> it = parser.iterator(); String[] header = null; if (it.hasNext()) { CSVRecord record = (CSVRecord) it.next(); Iterator<String> recordIterator = record.iterator(); int recordSize = record.size(); header = new String[recordSize]; for (int i = 0; i < recordSize; i++) { if (recordIterator.hasNext()) { header[i] = (String) recordIterator.next(); } else { throw new IOException("Record size doesn't match the real size"); } } EncodingUtil.handleBOMUTF8(header, 0); hlist.clear(); for (String s : header) { hlist.add(new Text(s)); } } instream.close(); } DelimitedSplit ds = new DelimitedSplit(new TextArrayWritable(hlist.toArray(new Text[hlist.size()])), path, fsplit.getStart(), fsplit.getLength(), fsplit.getLocations()); populatedSplits.add(ds); } return populatedSplits; }
From source file:com.marklogic.mapreduce.examples.ContentLoader.java
License:Apache License
@Override public void initialize(InputSplit inSplit, TaskAttemptContext context) throws IOException, InterruptedException { bytesTotal = inSplit.getLength();// w ww . j av a 2 s . c o m Path file = ((FileSplit) inSplit).getPath(); FileSystem fs = file.getFileSystem(context.getConfiguration()); FSDataInputStream fileIn = fs.open(file); key.set(file.toString()); byte[] buf = new byte[(int) inSplit.getLength()]; try { fileIn.readFully(buf); value.set(buf); hasNext = true; } catch (Exception e) { hasNext = false; } finally { fileIn.close(); } }
From source file:com.marklogic.mapreduce.examples.LinkCountHDFS.java
License:Apache License
/**
 * Parses the split's file as XML and collects the items selected by
 * {@code PATH_EXPRESSION} into {@code items}.
 *
 * <p>Both XML parse failures and XPath failures are reported as an
 * {@link IOException} with the original exception as its cause, so a
 * failure never leaves {@code items} unset or stale. {@code items} is
 * only replaced once the whole selection has succeeded.
 *
 * @param inSplit the split to read; cast to {@link FileSplit}
 * @param context supplies the configuration used to resolve the file system
 * @throws IOException if the file cannot be read, is not well-formed XML,
 *         or the XPath expression cannot be compiled or evaluated
 * @throws InterruptedException declared by the overridden method
 */
@Override
public void initialize(InputSplit inSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    Path file = ((FileSplit) inSplit).getPath();
    FileSystem fs = file.getFileSystem(context.getConfiguration());
    FSDataInputStream fileIn = fs.open(file);
    try {
        DocumentBuilder docBuilder = builderLocal.get();
        Document document = docBuilder.parse(fileIn);
        net.sf.saxon.s9api.DocumentBuilder db = saxonBuilderLocal.get();
        XdmNode xdmDoc = db.wrap(document);

        XPathCompiler xpath = proc.newXPathCompiler();
        xpath.declareNamespace("wp", "http://www.mediawiki.org/xml/export-0.4/");
        XPathSelector selector = xpath.compile(PATH_EXPRESSION).load();
        selector.setContextItem(xdmDoc);

        // build into a local list so a failure cannot leave a half-filled result
        List<XdmItem> selected = new ArrayList<XdmItem>();
        for (XdmItem item : selector) {
            selected.add(item);
        }
        items = selected;
    } catch (SAXException ex) {
        throw new IOException("Failed to parse " + file, ex);
    } catch (SaxonApiException ex) {
        // previously only printed; propagate like the parse failure above
        throw new IOException("Failed to evaluate XPath on " + file, ex);
    } finally {
        fileIn.close();
    }
}
From source file:com.marklogic.mapreduce.examples.WikiLoader.java
License:Apache License
@Override public void initialize(InputSplit inSplit, TaskAttemptContext context) throws IOException, InterruptedException { Path file = ((FileSplit) inSplit).getPath(); FileSystem fs = file.getFileSystem(context.getConfiguration()); FSDataInputStream fileIn = fs.open(file); byte[] buf = new byte[BUFFER_SIZE]; long bytesTotal = inSplit.getLength(); long start = ((FileSplit) inSplit).getStart(); fileIn.seek(start);/*from w w w .j av a2s. c om*/ long bytesRead = 0; StringBuilder pages = new StringBuilder(); int sindex = -1; while (true) { int length = (int) Math.min(bytesTotal - bytesRead, buf.length); int read = fileIn.read(buf, 0, length); if (read == -1) { System.out.println("Unexpected EOF: bytesTotal=" + bytesTotal + "bytesRead=" + bytesRead); break; } bytesRead += read; String temp = new String(new String(buf, 0, read)); if (sindex == -1) { // haven't found the start yet sindex = temp.indexOf(BEGIN_PAGE_TAG); if (sindex > -1) { pages.append(temp.substring(sindex)); } } else if (bytesRead < bytesTotal) { // haven't completed the split pages.append(temp); } else { // reached the end of this split // look for end int eindex = 0; if (temp.contains(END_DOC_TAG) || // reached the end of doc temp.endsWith(END_PAGE_TAG)) { eindex = temp.lastIndexOf(END_PAGE_TAG); pages.append(temp.substring(0, eindex + END_PAGE_TAG.length())); System.out.println("Found end of doc."); } else { // need to read ahead to look for end of page while (true) { read = fileIn.read(buf, 0, READ_AHEAD_SIZE); if (read == -1) { // no more to read System.out .println("Unexpected EOF: bytesTotal=" + bytesTotal + "bytesRead=" + bytesRead); System.out.println(temp); break; } bytesRead += read; // look for end temp = new String(buf, 0, read); eindex = temp.indexOf(END_PAGE_TAG); if (eindex > -1) { pages.append(temp.substring(0, eindex + END_PAGE_TAG.length())); break; } else { pages.append(temp); } } } break; } } fileIn.close(); articles = WikiModelProcessor.process(pages); }
From source file:com.marklogic.mapreduce.examples.ZipContentLoader.java
License:Apache License
/**
 * Opens the split's file and wraps it in a {@link ZipInputStream} stored
 * in {@code zipIn}.
 *
 * @param inSplit the split to read; cast to {@link FileSplit}
 * @param context supplies the configuration used to resolve the file system
 * @throws IOException if the file cannot be opened
 * @throws InterruptedException declared by the overridden method
 */
@Override
public void initialize(InputSplit inSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    final FileSplit fileSplit = (FileSplit) inSplit;
    final Path archive = fileSplit.getPath();
    final FileSystem fileSystem = archive.getFileSystem(context.getConfiguration());
    zipIn = new ZipInputStream(fileSystem.open(archive));
}
From source file:com.marklogic.mapreduce.ForestInputFormat.java
License:Apache License
@Override public List<InputSplit> getSplits(JobContext job) throws IOException { long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job)); long maxSize = getMaxSplitSize(job); // generate splits List<InputSplit> splits = new ArrayList<InputSplit>(); List<FileStatus> files = listStatus(job); for (FileStatus file : files) { // stand directories Path path = file.getPath(); FileSystem fs = path.getFileSystem(job.getConfiguration()); FileStatus children[] = fs.listStatus(path); FileStatus treeIndexStatus = null, treeDataStatus = null, ordinalsStatus = null, timestampsStatus = null; boolean obsolete = false; for (FileStatus child : children) { String fileName = child.getPath().getName(); if (fileName.equals("TreeData")) { // inside a stand treeDataStatus = child;//w w w .ja v a2s .com } else if (fileName.equals("TreeIndex")) { treeIndexStatus = child; } else if (fileName.equals("Ordinals")) { ordinalsStatus = child; } else if (fileName.equals("Timestamps")) { timestampsStatus = child; } else if (fileName.equals("Obsolete")) { obsolete = true; break; } } if (obsolete) { LOG.warn("Obsolete file found. The forest is either live or isn't " + "dismounted cleanly. Ignoring stand " + path); break; } if (treeDataStatus == null) { throw new RuntimeException("TreeData file not found."); } else if (treeIndexStatus == null) { throw new RuntimeException("TreeIndex file not found."); } else if (ordinalsStatus == null) { throw new RuntimeException("Ordinals file not found."); } else if (timestampsStatus == null) { throw new RuntimeException("Timestamps file not found."); } long treeDataSize = treeDataStatus.getLen(); if (treeDataSize == 0) { // unexpected, give up this stand LOG.warn("Found empty TreeData file. 
Skipping..."); continue; // skipping this stand } Path treeDataPath = treeDataStatus.getPath(); long blockSize = treeDataStatus.getBlockSize(); long splitSize = computeSplitSize(blockSize, minSize, maxSize); // make splits based on TreeIndex FSDataInputStream is = fs.open(treeIndexStatus.getPath()); BiendianDataInputStream in = new BiendianDataInputStream(is); int prevDocid = -1, docid = -1, position = 0; long prevOffset = -1L, offset = 0, splitStart = 0; BlockLocation[] blkLocations = fs.getFileBlockLocations(treeDataStatus, 0, treeDataSize); try { for (;; ++position) { try { docid = in.readInt(); in.readInt(); offset = in.readLong(); } catch (EOFException e) { break; } int comp = InternalUtilities.compareUnsignedLong(offset, treeDataSize); if (comp > 0) { throw new RuntimeException("TreeIndex offset is out of bound: position = " + position + ", offset = " + offset + ", treeDataSize = " + treeDataSize); } if (prevDocid != -1 && (docid & 0xffffffffL) <= (prevDocid & 0xffffffffL)) { throw new RuntimeException("docid out of order, position = " + position + ", docid = " + docid + ", prevDocid = " + prevDocid); } prevDocid = docid; if (prevOffset != -1L && InternalUtilities.compareUnsignedLong(offset, prevOffset) <= 0) { throw new RuntimeException("offset out of order, position = " + position + ", offset = " + offset + ", prevOffset = " + prevOffset); } long splitLen = offset - splitStart; if (splitLen == splitSize || (splitLen > splitSize && splitLen - splitSize <= splitSize - (prevOffset - splitStart))) { int blkIndex = getBlockIndex(blkLocations, offset); InputSplit split = new FileSplit(treeDataPath, splitStart, splitLen, blkLocations[blkIndex].getHosts()); if (LOG.isDebugEnabled()) { LOG.debug("Created split: start=" + splitStart + " len=" + splitLen + " last docid=" + docid); } splits.add(split); splitStart = offset; } else if (splitLen > splitSize) { int blkIndex = getBlockIndex(blkLocations, prevOffset); InputSplit split = new FileSplit(treeDataPath, 
splitStart, prevOffset - splitStart, blkLocations[blkIndex].getHosts()); if (LOG.isDebugEnabled()) { LOG.debug("Created split: start=" + splitStart + " len=" + (prevOffset - splitStart) + " last docid=" + docid); } splits.add(split); splitStart = prevOffset; } } } finally { in.close(); } if (offset > splitStart) { int blkIndex = getBlockIndex(blkLocations, offset - 1); InputSplit split = new FileSplit(treeDataPath, splitStart, offset - splitStart, blkLocations[blkIndex].getHosts()); if (LOG.isDebugEnabled()) { LOG.debug("Created split: start=" + splitStart + " len=" + (offset - splitStart) + " last docid=" + docid); } splits.add(split); } } if (LOG.isDebugEnabled()) { LOG.debug("Made " + splits.size() + " splits."); } return splits; }
From source file:com.marklogic.mapreduce.ForestReader.java
License:Apache License
/**
 * Prepares the reader for one split.
 *
 * <p>Opens the split's data file and skips to the split's start, opens
 * the {@code Ordinals} and {@code Timestamps} files that sit next to it,
 * resolves the configured value class, and loads the collection,
 * directory and type filters from the configuration. Directory filters
 * that lack a trailing {@code "/"} are replaced by the same value with
 * the slash appended.
 *
 * @param split the split to read; cast to {@link FileSplit}
 * @param context supplies the job configuration
 * @throws IOException if one of the files cannot be opened
 * @throws InterruptedException declared by the overridden method
 * @throws IllegalArgumentException if the configured value class is not a
 *         {@code ForestDocument}
 */
@Override
public void initialize(InputSplit split, TaskAttemptContext context)
        throws IOException, InterruptedException {
    this.split = (FileSplit) split;
    conf = context.getConfiguration();

    Path dataFile = this.split.getPath();
    Path parentDir = dataFile.getParent();
    FileSystem fileSystem = dataFile.getFileSystem(conf);

    // the data stream is positioned at the split start; the two companion
    // files live in the same directory
    dataIs = new BiendianDataInputStream(fileSystem.open(dataFile));
    dataIs.skipBytes(this.split.getStart());
    ordIs = new BiendianDataInputStream(fileSystem.open(new Path(parentDir, "Ordinals")));
    tsIs = new BiendianDataInputStream(fileSystem.open(new Path(parentDir, "Timestamps")));

    valueClass = conf.getClass(INPUT_VALUE_CLASS, ForestDocument.class, Writable.class);
    boolean supported = ForestDocument.class.isAssignableFrom(valueClass);
    if (!supported) {
        throw new IllegalArgumentException("Unsupported " + INPUT_VALUE_CLASS);
    }

    largeForestDir = new Path(parentDir.getParent(), "Large");
    colFilters = conf.getStringCollection(COLLECTION_FILTER);
    dirFilters = conf.getStringCollection(DIRECTORY_FILTER);

    // normalise directory filters: drop each entry without a trailing slash
    // and re-add it afterwards with the slash appended
    Collection<String> withSlash = new ArrayList<String>();
    Iterator<String> dirs = dirFilters.iterator();
    while (dirs.hasNext()) {
        String dir = dirs.next();
        if (dir.endsWith("/")) {
            continue;
        }
        dirs.remove();
        withSlash.add(dir + "/");
    }
    if (!withSlash.isEmpty()) {
        dirFilters.addAll(withSlash);
    }

    typeFilters = conf.getStringCollection(TYPE_FILTER);
}
From source file:com.marklogic.mapreduce.LargeBinaryDocument.java
License:Apache License
public byte[] getContentAsByteArray(int offset, int len) { FileSystem fs; FSDataInputStream is = null;//w ww .j a v a 2s .c om try { fs = path.getFileSystem(conf); if (!fs.exists(path)) { throw new RuntimeException("File not found: " + path); } FileStatus status = fs.getFileStatus(path); if (status.getLen() < offset) { throw new RuntimeException("Reached end of file: " + path); } byte[] buf = new byte[len]; is = fs.open(path); for (int toSkip = offset, skipped = 0; toSkip < offset; toSkip -= skipped) { skipped = is.skipBytes(offset); } for (int bytesRead = 0; bytesRead < len;) { bytesRead += is.read(buf, bytesRead, len - bytesRead); } return buf; } catch (IOException e) { throw new RuntimeException("Error accessing file: " + path, e); } finally { if (is != null) { try { is.close(); } catch (IOException e) { } } } }
From source file:com.marklogic.mapreduce.LargeBinaryDocument.java
License:Apache License
/**
 * Opens the file at {@code path} and returns the stream to the caller,
 * which is left open on return.
 *
 * @return an open stream over the file's content
 * @throws RuntimeException if the file does not exist or an I/O error
 *         occurs while resolving or opening it (the {@link IOException} is
 *         attached as the cause)
 */
@Override
public InputStream getContentAsByteStream() {
    try {
        final FileSystem fileSystem = path.getFileSystem(conf);
        final boolean present = fileSystem.exists(path);
        if (!present) {
            throw new RuntimeException("File not found: " + path);
        }
        return fileSystem.open(path);
    } catch (IOException cause) {
        throw new RuntimeException("Error accessing file: " + path, cause);
    }
}
From source file:com.marklogic.mapreduce.test.FCheck.java
License:Apache License
public void checkTreeIndex(File dir) throws IOException { File file = new File(dir, "TreeIndex"); if (verbose)/*from www . j a v a 2s . c o m*/ System.out.println(file.getAbsolutePath() + " -> checkTreeIndex"); // BiendianDataInputStream in = openFile(file, 1 << 18); Path path = new Path(dir.getAbsolutePath()); FileSystem fs = path.getFileSystem(new Configuration()); FileStatus children[] = fs.listStatus(path); FileStatus treeIndexStatus = null, treeDataStatus = null; for (FileStatus child : children) { String fileName = child.getPath().getName(); if (fileName.equals("TreeData")) { // inside a stand treeDataStatus = child; } else if (fileName.equals("TreeIndex")) { treeIndexStatus = child; } if (treeDataStatus != null && treeIndexStatus != null) { break; } } if (treeDataStatus == null) { throw new RuntimeException("TreeData file not found."); } else if (treeIndexStatus == null) { throw new RuntimeException("TreeIndex file not found."); } long treeDataSize = treeDataStatus.getLen(); if (treeDataSize == 0) { // unexpected, give up this stand System.err.println("Found empty TreeData file. 
Skipping..."); return; } FSDataInputStream is = fs.open(treeIndexStatus.getPath()); BiendianDataInputStream in = new BiendianDataInputStream(is); in.setLittleEndian(littleEndian); int prevDocid = -1; long prevOffset = -1L; int position = 0; int docid; long offset; for (;; ++position) { try { docid = in.readInt(); in.readInt(); offset = in.readLong(); } catch (EOFException e) { break; } if (debug) { System.out.println(String.format("TreeIndex p %08x d %08x o %016x", position, docid, offset)); } if (compareUnsignedLong(offset, treeDataSize) >= 0) { panic(file, String.format("offset out of range, position=%d, offset=%d, treeDataSize=%d", position, offset, treeDataSize)); } if (prevDocid != -1 && (docid & 0xffffffffL) <= (prevDocid & 0xffffffffL)) { panic(file, String.format("docid out of order, position=%d, docid=%d, prevDocid=%d", position, docid, prevDocid)); } prevDocid = docid; if (prevOffset != -1L && compareUnsignedLong(offset, prevOffset) <= 0) { panic(file, String.format("offset out of order, position=%d, offset=%d, prevOffset=%d", position, offset, prevOffset)); } prevOffset = offset; } if (verbose) System.out.println(file.getAbsolutePath() + " <- checkTreeIndex [" + position + "]"); }