Example usage of the org.apache.hadoop.mapred.LineRecordReader constructor

List of usage examples for org.apache.hadoop.mapred LineRecordReader LineRecordReader

Introduction

On this page you can find usage examples for the constructor of org.apache.hadoop.mapred.LineRecordReader.

Prototype

public LineRecordReader(InputStream in, long offset, long endOffset, Configuration job) throws IOException 

Source Link

Usage

From source file:cascading.tap.hadoop.ZipInputFormat.java

License:Open Source License

/**
 * Creates a line reader over the concatenated contents of every non-directory entry of the
 * zip archive read from {@code inputStream}.
 * <p>
 * Entries are exposed one after another through a {@link SequenceInputStream}. Progress is
 * reported as the number of bytes read so far divided by the summed sizes of the entries
 * opened so far, capped at 1.0.
 *
 * @param inputStream stream positioned at the start of a zip archive
 * @return a record reader producing one record per line across all entries
 * @throws IOException if the underlying reader cannot be created
 */
private RecordReader<LongWritable, Text> getReaderForAll(final FSDataInputStream inputStream)
        throws IOException {
    // single-element arrays so that the anonymous classes below can update the counters
    final long bytesSize[] = new long[] { 0 };
    final long bytesRead[] = new long[] { 0 };

    Enumeration<InputStream> enumeration = new Enumeration<InputStream>() {
        // true when getNext() has already advanced to an entry that was not handed out yet
        boolean returnCurrent = false;
        ZipEntry nextEntry;
        ZipInputStream zipInputStream = new ZipInputStream(inputStream);
        // the same wrapper is returned for every entry; it ignores close() so that
        // SequenceInputStream cannot close the zip stream between entries
        InputStream closeableInputStream = makeInputStream(zipInputStream);

        public boolean hasMoreElements() {
            if (returnCurrent)
                return nextEntry != null;

            getNext();

            return nextEntry != null;
        }

        public InputStream nextElement() {
            if (returnCurrent) {
                returnCurrent = false;
                return closeableInputStream;
            }

            getNext();

            if (nextEntry == null)
                throw new IllegalStateException("no more zip entries in zip input stream");

            // the entry located by getNext() is consumed right here, so it must not be
            // reported again by a following hasMoreElements()
            returnCurrent = false;

            return closeableInputStream;
        }

        private void getNext() {
            try {
                nextEntry = zipInputStream.getNextEntry();

                while (nextEntry != null && nextEntry.isDirectory())
                    nextEntry = zipInputStream.getNextEntry();

                if (nextEntry != null) {
                    // getSize() returns -1 when the size is unknown; do not let that
                    // shrink the total used for progress reporting
                    long entrySize = nextEntry.getSize();

                    if (entrySize > 0)
                        bytesSize[0] += entrySize;
                }

                returnCurrent = true;
            } catch (IOException exception) {
                throw new RuntimeException("could not get next zip entry", exception);
            } finally {
                // i think, better than sending across a fake input stream that closes the zip
                if (nextEntry == null)
                    safeClose(zipInputStream);
            }
        }

        private InputStream makeInputStream(ZipInputStream zipInputStream) {
            // read(byte[]) is intentionally not overridden: FilterInputStream implements it
            // as read(bytes, 0, bytes.length), which is counted below exactly once
            return new FilterInputStream(zipInputStream) {
                @Override
                public int read() throws IOException {
                    int result = super.read();

                    // -1 signals end of entry, no byte was consumed
                    if (result != -1)
                        bytesRead[0]++;

                    return result;
                }

                @Override
                public int read(byte[] bytes, int i, int i1) throws IOException {
                    int result = super.read(bytes, i, i1);

                    // -1 signals end of entry and must not be subtracted from the count
                    if (result > 0)
                        bytesRead[0] += result;

                    return result;
                }

                @Override
                public long skip(long l) throws IOException {
                    long result = super.skip(l);

                    if (result > 0)
                        bytesRead[0] += result;

                    return result;
                }

                @Override
                public void close() throws IOException {
                    // do nothing
                }
            };
        }
    };

    return new LineRecordReader(new SequenceInputStream(enumeration), 0, Long.MAX_VALUE, Integer.MAX_VALUE) {
        @Override
        public float getProgress() {
            if (0 == bytesSize[0])
                return 0.0f;
            else
                return Math.min(1.0f, bytesRead[0] / (float) bytesSize[0]);
        }
    };
}

From source file:cascading.tap.hadoop.ZipInputFormat.java

License:Open Source License

/**
 * Builds a line reader over a single entry of the zip archive read from {@code inputStream}.
 * <p>
 * The zip stream is advanced entry by entry until one whose name equals
 * {@code split.getEntryPath()} is found. If no entry matches, the stream ends up exhausted
 * and the reader is created over it anyway.
 *
 * @param inputStream stream positioned at the start of a zip archive
 * @param split       split naming the entry to read
 * @param length      end offset handed to the line reader
 * @return a record reader over the positioned zip stream
 * @throws IOException if advancing through the archive fails
 */
private RecordReader<LongWritable, Text> getReaderForEntry(FSDataInputStream inputStream, ZipSplit split,
        long length) throws IOException {
    final ZipInputStream zip = new ZipInputStream(inputStream);
    final String wantedName = split.getEntryPath();

    // walk the archive until the requested entry is current or no entries remain
    for (ZipEntry candidate = zip.getNextEntry(); candidate != null; candidate = zip.getNextEntry()) {
        if (candidate.getName().equals(wantedName)) {
            break;
        }
    }

    return new LineRecordReader(zip, 0, length, Integer.MAX_VALUE);
}

From source file:edu.umn.cs.spatialHadoop.mapred.SpatialInputFormat.java

License:Open Source License

/**
 * Collects the input files found under {@code dir} into {@code result}.
 * <p>
 * When {@code dir} has no global index, the directory (or wildcard) is listed and walked
 * recursively. A file whose name ends with {@code .list} is read line by line and each line
 * is resolved as a file name relative to the list file's parent directory. When a global
 * index exists, only the partitions selected by {@code filter} are added.
 *
 * @param fs     file system to list
 * @param dir    directory or wildcard path to expand
 * @param result list that receives the matched file statuses
 * @param filter block filter used to select partitions from the global index
 * @throws IOException if listing or reading a list file fails
 */
protected void listStatus(final FileSystem fs, Path dir, final List<FileStatus> result, BlockFilter filter)
        throws IOException {
    GlobalIndex<Partition> gindex = SpatialSite.getGlobalIndex(fs, dir);
    if (gindex == null) {
        FileStatus[] listStatus;
        if (OperationsParams.isWildcard(dir)) {
            // Wild card
            listStatus = fs.globStatus(dir);
        } else {
            listStatus = fs.listStatus(dir, SpatialSite.NonHiddenFileFilter);
        }
        if (listStatus == null) {
            // Hadoop may report "nothing there" as null instead of an empty array
            return;
        }
        // Add all files under this directory
        for (FileStatus status : listStatus) {
            if (status.isDir()) {
                listStatus(fs, status.getPath(), result, filter);
            } else if (status.getPath().getName().toLowerCase().endsWith(".list")) {
                LineRecordReader in = new LineRecordReader(fs.open(status.getPath()), 0, status.getLen(),
                        Integer.MAX_VALUE);
                try {
                    LongWritable key = in.createKey();
                    Text value = in.createValue();
                    while (in.next(key, value)) {
                        result.add(fs.getFileStatus(new Path(status.getPath().getParent(), value.toString())));
                    }
                } finally {
                    // release the reader even when a listed file cannot be resolved
                    in.close();
                }
            } else {
                result.add(status);
            }
        }
    } else {
        final Path indexDir = OperationsParams.isWildcard(dir) ? dir.getParent() : dir;
        // Use the global index to limit files
        filter.selectCells(gindex, new ResultCollector<Partition>() {
            @Override
            public void collect(Partition partition) {
                Path cell_path = new Path(indexDir, partition.filename);
                try {
                    if (!fs.exists(cell_path)) {
                        // skip the file; asking for its status would only fail again
                        LOG.warn("Matched file not found: " + cell_path);
                        return;
                    }
                    result.add(fs.getFileStatus(cell_path));
                } catch (IOException e) {
                    LOG.warn("Could not get status of matched file: " + cell_path, e);
                }
            }
        });
    }
}

From source file:edu.umn.cs.spatialHadoop.operations.CatUnion.java

License:Open Source License

/**
 * Reads the category file from the local file system and fills {@code idToCategory} with a
 * mapping from shape id to a numeric category id.
 * <p>
 * Each line is parsed with {@code TextSerializerHelper.consumeInt(line, ',')} to obtain the
 * shape id; what is left in the line afterwards is used as the category name (presumably the
 * helper strips the parsed number and the comma from the line; verify against the helper).
 * Distinct category names are numbered from 0 in their sorted order.
 *
 * @param categoryFile path of the category file on the local file system
 * @param idToCategory map that receives one entry per shape id read from the file
 * @throws IOException if the file cannot be opened or read
 */
private static void readCategories(Path categoryFile, Map<Integer, Integer> idToCategory) throws IOException {
    Map<Integer, String> idToCatName = new HashMap<Integer, String>();
    Configuration conf = new Configuration();
    FileSystem fsCategory = FileSystem.getLocal(conf);
    long categoryFileSize = fsCategory.getFileStatus(categoryFile).getLen();
    if (categoryFileSize > 1024 * 1024)
        LOG.warn("Category file size is big: " + categoryFileSize);
    InputStream inCategory = fsCategory.open(categoryFile);
    LineRecordReader lineReader = new LineRecordReader(inCategory, 0, categoryFileSize, conf);

    // TreeSet keeps the names sorted so that category ids are assigned deterministically
    Set<String> catNames = new TreeSet<String>();

    try {
        LongWritable lineOffset = lineReader.createKey();
        Text line = lineReader.createValue();

        while (lineReader.next(lineOffset, line)) {
            int shape_id = TextSerializerHelper.consumeInt(line, ',');
            String cat_name = line.toString();
            catNames.add(cat_name);
            idToCatName.put(shape_id, cat_name);
        }
    } finally {
        // release the reader even when a line fails to parse
        lineReader.close();
    }

    // Change category names to numbers
    Map<String, Integer> cat_name_to_id = new HashMap<String, Integer>();
    int cat_id = 0;
    for (String cat_name : catNames) {
        cat_name_to_id.put(cat_name, cat_id++);
    }

    for (Map.Entry<Integer, String> entry : idToCatName.entrySet()) {
        idToCategory.put(entry.getKey(), cat_name_to_id.get(entry.getValue()));
    }
}

From source file:kogiri.mapreduce.libra.kmersimilarity_m.KmerSimilarityMap.java

License:Open Source License

/**
 * Accumulates the scores of all k-mer similarity result files and writes the total to a
 * final result file under {@code outputPath}.
 * <p>
 * The first line read seeds the accumulator; the score of every following line is added to
 * it. The accumulated array is written one value per line as {@code x-y<TAB>value}, where
 * {@code x} and {@code y} are the index divided by, and modulo, the integer square root of
 * the array length.
 *
 * @param outputPath directory holding the partial results and receiving the final file
 * @param conf       configuration used to resolve the file system and read the files
 * @throws IOException if reading or writing fails, or if no score line was found at all
 */
private void sumScores(Path outputPath, Configuration conf) throws IOException {
    Path[] resultFiles = KmerSimilarityHelper.getAllKmerSimilarityResultFilePath(conf, outputPath.toString());
    FileSystem fs = outputPath.getFileSystem(conf);

    KmerSimilarityOutputRecord scoreRec = null;
    for (Path resultFile : resultFiles) {
        LOG.info("Reading the scores from " + resultFile.toString());
        FSDataInputStream is = fs.open(resultFile);
        FileStatus status = fs.getFileStatus(resultFile);

        LineRecordReader reader = new LineRecordReader(is, 0, status.getLen(), conf);
        try {
            LongWritable off = new LongWritable();
            Text val = new Text();

            while (reader.next(off, val)) {
                if (scoreRec == null) {
                    scoreRec = KmerSimilarityOutputRecord.createInstance(val.toString());
                } else {
                    KmerSimilarityOutputRecord rec2 = KmerSimilarityOutputRecord.createInstance(val.toString());
                    scoreRec.addScore(rec2.getScore());
                }
            }
        } finally {
            // release the reader even when a line cannot be parsed
            reader.close();
        }
    }

    if (scoreRec == null) {
        // no result file contained a line; fail with a clear message instead of an NPE
        throw new IOException("No k-mer similarity scores found under " + outputPath);
    }

    double[] accumulatedScore = scoreRec.getScore();

    String resultFilename = KmerSimilarityHelper.makeKmerSimilarityFinalResultFileName();
    Path resultFilePath = new Path(outputPath, resultFilename);

    LOG.info("Creating a final score file : " + resultFilePath.toString());

    FSDataOutputStream os = fs.create(resultFilePath);
    try {
        int n = (int) Math.sqrt(accumulatedScore.length);
        for (int i = 0; i < accumulatedScore.length; i++) {
            int x = i / n;
            int y = i % n;

            String k = x + "-" + y;
            String v = Double.toString(accumulatedScore[i]);
            String out = k + "\t" + v + "\n";
            // fixed charset so the output does not depend on the platform default
            os.write(out.getBytes("UTF-8"));
        }
    } finally {
        os.close();
    }
}

From source file:libra.core.kmersimilarity_m.KmerSimilarityMap.java

License:Apache License

/**
 * Accumulates the scores of all k-mer similarity result files and writes the total to a
 * final result file under {@code outputPath}.
 * <p>
 * The first line read seeds the accumulator; the score of every following line is added to
 * it. The accumulated array is written one value per line as {@code x-y<TAB>value}, where
 * {@code x} and {@code y} are the index divided by, and modulo, the integer square root of
 * the array length. Entries with {@code x == y} are always written as 1.0, regardless of
 * the accumulated value.
 *
 * @param outputPath directory holding the partial results and receiving the final file
 * @param conf       configuration used to resolve the file system and read the files
 * @throws IOException if reading or writing fails, or if no score line was found at all
 */
private void sumScores(Path outputPath, Configuration conf) throws IOException {
    Path[] resultFiles = KmerSimilarityHelper.getAllKmerSimilarityResultFilePath(conf, outputPath.toString());
    FileSystem fs = outputPath.getFileSystem(conf);

    KmerSimilarityOutputRecord scoreRec = null;
    for (Path resultFile : resultFiles) {
        LOG.info("Reading the scores from " + resultFile.toString());
        FSDataInputStream is = fs.open(resultFile);
        FileStatus status = fs.getFileStatus(resultFile);

        LineRecordReader reader = new LineRecordReader(is, 0, status.getLen(), conf);
        try {
            LongWritable off = new LongWritable();
            Text val = new Text();

            while (reader.next(off, val)) {
                if (scoreRec == null) {
                    scoreRec = KmerSimilarityOutputRecord.createInstance(val.toString());
                } else {
                    KmerSimilarityOutputRecord rec2 = KmerSimilarityOutputRecord.createInstance(val.toString());
                    scoreRec.addScore(rec2.getScore());
                }
            }
        } finally {
            // release the reader even when a line cannot be parsed
            reader.close();
        }
    }

    if (scoreRec == null) {
        // no result file contained a line; fail with a clear message instead of an NPE
        throw new IOException("No k-mer similarity scores found under " + outputPath);
    }

    double[] accumulatedScore = scoreRec.getScore();

    String resultFilename = KmerSimilarityHelper.makeKmerSimilarityFinalResultFileName();
    Path resultFilePath = new Path(outputPath, resultFilename);

    LOG.info("Creating a final score file : " + resultFilePath.toString());

    FSDataOutputStream os = fs.create(resultFilePath);
    try {
        int n = (int) Math.sqrt(accumulatedScore.length);
        for (int i = 0; i < accumulatedScore.length; i++) {
            int x = i / n;
            int y = i % n;

            String k = x + "-" + y;
            String v = Double.toString(accumulatedScore[i]);
            if (x == y) {
                // diagonal entries are reported as 1.0
                v = Double.toString(1.0);
            }
            String out = k + "\t" + v + "\n";
            // fixed charset so the output does not depend on the platform default
            os.write(out.getBytes("UTF-8"));
        }
    } finally {
        os.close();
    }
}