Example usage for org.apache.hadoop.fs FileSystem open

List of usage examples for org.apache.hadoop.fs FileSystem open

Introduction

On this page you can find example usage for org.apache.hadoop.fs FileSystem open.

Prototype

public FSDataInputStream open(Path f) throws IOException 

Document

Opens an FSDataInputStream at the indicated Path.
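
Before the usage examples below, here is a minimal, self-contained sketch of calling FileSystem.open on a Path and reading the returned FSDataInputStream. The class name and file path (OpenExample, /tmp/example.txt) are placeholders for illustration only and do not come from the examples on this page.

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class OpenExample {
    public static void main(String[] args) throws IOException {
        // Placeholder path; replace with a real HDFS or local file.
        Path path = new Path("/tmp/example.txt");
        Configuration conf = new Configuration();
        FileSystem fs = path.getFileSystem(conf);

        // FileSystem.open(Path) returns an FSDataInputStream positioned at the start of the file.
        try (FSDataInputStream in = fs.open(path);
                BufferedReader reader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                System.out.println(line);
            }
        }
    }
}

The try-with-resources block closes the stream automatically; the examples below close it explicitly with in.close().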

Usage

From source file:co.nubetech.hiho.merge.TestMergeJob.java

License:Apache License

@Test
public void testMergeByKeyWithKeyValueTextInputFormat() throws Exception {

    final String inputData1 = "A\tMacon Kent,6269 Aenean St.,1-247-399-1051,08253"
            + "\nB\tDale Zamora,521-7792 Mauris Rd.,1-214-625-6970,90510"
            + "\nC\tCharles Wood,525-9709 In Rd.,1-370-528-4758,62714";
    final String inputData2 = "A\tTimon Leonard,716 Ac Ave,1-857-935-3882,62240"
            + "\nD\tMacaulay Jackson,5435 Dui. Avenue,1-770-395-6446,31584"
            + "\nB\tCharles Wood,525-9709 In Rd.,1-370-528-4758,62714";
    createTextFileInHDFS(inputData1, "/input1", "testFile1.txt");
    createTextFileInHDFS(inputData2, "/input2", "testFile2.txt");

    String[] args = new String[] { "-newPath", "/input1", "-oldPath", "/input2", "-mergeBy", "key",
            "-outputPath", "output", "-inputFormat",
            "org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat", "-inputKeyClassName",
            "org.apache.hadoop.io.Text", "-inputValueClassName", "org.apache.hadoop.io.Text", "-outputFormat",
            "co.nubetech.hiho.mapreduce.lib.output.NoKeyOnlyValueOutputFormat" };
    MergeJob job = runMergeJobs(args);
    assertEquals(3, job.getTotalRecordsNew());
    assertEquals(3, job.getTotalRecordsOld());
    assertEquals(0, job.getBadRecords());
    assertEquals(4, job.getOutput());

    FileSystem outputFS = getFileSystem();
    Path outputPath = new Path(outputFS.getHomeDirectory(), "output");
    FileStatus[] status = outputFS.listStatus(outputPath, getOutputPathFilter());
    assertTrue(outputFS.exists(outputPath));
    List<String> expectedOutput = new ArrayList<String>();
    expectedOutput.add("Macon Kent,6269 Aenean St.,1-247-399-1051,08253");
    expectedOutput.add("Dale Zamora,521-7792 Mauris Rd.,1-214-625-6970,90510");
    expectedOutput.add("Charles Wood,525-9709 In Rd.,1-370-528-4758,62714");
    expectedOutput.add("Macaulay Jackson,5435 Dui. Avenue,1-770-395-6446,31584");
    int count = 0;
    for (FileStatus fileStat : status) {
        logger.debug("File status is " + fileStat.getPath() + " and is it a dir? " + fileStat.isDirectory());
        FSDataInputStream in = outputFS.open(fileStat.getPath());
        String line = null;
        while ((line = in.readLine()) != null) {
            logger.debug("Output is " + line);
            assertTrue("Matched output " + line, expectedOutput.contains(line));
            expectedOutput.remove(line);
            count++;
        }
        in.close();
    }
    assertEquals(4, count);
}

From source file:co.nubetech.hiho.merge.TestMergeJob.java

License:Apache License

@Test
public void testMergeByValueWithSequenceFileAsTextInputFormat() throws Exception {
    HashMap<IntWritable, Text> inputData1 = new HashMap<IntWritable, Text>();
    inputData1.put(new IntWritable(1), new Text("Macon Kent,6269 Aenean St.,1-247-399-1051,08253"));
    inputData1.put(new IntWritable(2), new Text("Dale Zamora,521-7792 Mauris Rd.,1-214-625-6970,90510"));
    inputData1.put(new IntWritable(3), new Text("Charles Wood,525-9709 In Rd.,1-370-528-4758,62714"));
    createSequenceFileInHdfs(inputData1, "/input1", "testFile1.seq");

    HashMap<IntWritable, Text> inputData2 = new HashMap<IntWritable, Text>();
    inputData2.put(new IntWritable(1), new Text("Timon Leonard,716 Ac Ave,1-857-935-3882,62240"));
    inputData2.put(new IntWritable(2), new Text("Macaulay Jackson,5435 Dui. Avenue,1-770-395-6446,31584"));
    inputData2.put(new IntWritable(4), new Text("Charles Wood,525-9709 In Rd.,1-370-528-4758,62714"));
    createSequenceFileInHdfs(inputData2, "/input2", "testFile2.seq");

    String[] args = new String[] { "-newPath", "/input1", "-oldPath", "/input2", "-mergeBy", "value",
            "-outputPath", "output", "-inputFormat",
            "org.apache.hadoop.mapreduce.lib.input.SequenceFileAsTextInputFormat", "-inputKeyClassName",
            "org.apache.hadoop.io.Text", "-inputValueClassName", "org.apache.hadoop.io.Text", "-outputFormat",
            "co.nubetech.hiho.mapreduce.lib.output.NoKeyOnlyValueOutputFormat" };
    MergeJob job = runMergeJobs(args);
    assertEquals(3, job.getTotalRecordsNew());
    assertEquals(3, job.getTotalRecordsOld());
    assertEquals(0, job.getBadRecords());
    assertEquals(5, job.getOutput());

    FileSystem outputFS = getFileSystem();
    Path outputPath = new Path(outputFS.getHomeDirectory(), "output");
    FileStatus[] status = outputFS.listStatus(outputPath, getOutputPathFilter());
    assertTrue(outputFS.exists(outputPath));
    List<String> expectedOutput = new ArrayList<String>();
    expectedOutput.add("Macon Kent,6269 Aenean St.,1-247-399-1051,08253");
    expectedOutput.add("Dale Zamora,521-7792 Mauris Rd.,1-214-625-6970,90510");
    expectedOutput.add("Charles Wood,525-9709 In Rd.,1-370-528-4758,62714");
    expectedOutput.add("Timon Leonard,716 Ac Ave,1-857-935-3882,62240");
    expectedOutput.add("Macaulay Jackson,5435 Dui. Avenue,1-770-395-6446,31584");
    int count = 0;
    for (FileStatus fileStat : status) {
        logger.debug("File status is " + fileStat.getPath() + " and is it a dir? " + fileStat.isDirectory());
        FSDataInputStream in = outputFS.open(fileStat.getPath());
        String line = null;
        while ((line = in.readLine()) != null) {
            logger.debug("Output is " + line);
            assertTrue("Matched output " + line, expectedOutput.contains(line));
            expectedOutput.remove(line);
            count++;
        }
        in.close();
    }
    assertEquals(5, count);
}

From source file:coldstorage.io.Reader.java

License:Apache License

public static void main(String[] args) throws IOException {

    List<Long> idsToFind = new ArrayList<Long>();
    int maxId = 100000000;
    Random random = new Random(1);
    for (int i = 0; i < 1000; i++) {
        long id = (long) random.nextInt(maxId);
        //      System.out.println(id);
        idsToFind.add(id);
    }

    // idsToFind.clear();
    // idsToFind.add(58998000L);

    //    Path pathData = new Path("./out/data.avro");
    //    Path pathIndex = new Path("./out/data.index");

    Path pathData = new Path("hdfs://localhost:9000/avro/out/data.avro");
    Path pathIndex = new Path("hdfs://localhost:9000/avro/out/data.index");

    Configuration configuration = new Configuration();
    FileSystem fileSystem = pathData.getFileSystem(configuration);
    FileStatus indexFileStatus = fileSystem.getFileStatus(pathIndex);
    FileStatus dataFileStatus = fileSystem.getFileStatus(pathData);
    FSDataInputStream indexInputStream = fileSystem.open(pathIndex);
    FSDataInputStream dataInputStream = fileSystem.open(pathData);

    AvroFSInput fsInput = new AvroFSInput(dataInputStream, dataFileStatus.getLen());
    GenericDatumReader<GenericRecord> gdr = new GenericDatumReader<GenericRecord>();
    DataFileReader<GenericRecord> reader = new DataFileReader<GenericRecord>(fsInput, gdr);

    List<IndexKey> list = getList(indexInputStream, indexFileStatus.getLen());

    for (Long idToFind : idsToFind) {
        long t1 = System.nanoTime();
        GenericRecord lookupRecord = lookupRecord(reader, list, idToFind);
        long t2 = System.nanoTime();
        System.out.println("Found [" + idToFind + "] in [" + (t2 - t1) / 1000000.0 + " ms]:" + lookupRecord);
    }
}

From source file:ColumnStorage.ColumnProject.java

License:Open Source License

void loadColmnInfoFromNavigator(FileSystem fs, Path naviPath) throws Exception {
    FSDataInputStream in = fs.open(naviPath);

    int magic = in.readInt();
    if (magic != ConstVar.NaviMagic) {
        throw new SEException.ErrorFileFormat("invalid navi magic:" + magic + ",file:" + naviPath.toString());
    }

    short infoNum = in.readShort();
    for (int i = 0; i < infoNum; i++) {
        infos.add(loadColumnInfo(in));
    }
}

From source file:com.acme.io.JsonLoader.java

License:Apache License

/**
 * Get a schema for the data to be loaded.  
 * @param location Location as returned by 
 * {@link LoadFunc#relativeToAbsolutePath(String, org.apache.hadoop.fs.Path)}
 * @param job The {@link Job} object - this should be used only to obtain 
 * cluster properties through {@link Job#getConfiguration()} and not to
 * set/query any runtime job information.  
 * @return schema for the data to be loaded. This schema should represent
 * all tuples of the returned data.  If the schema is unknown or it is
 * not possible to return a schema that represents all returned data,
 * then null should be returned. The schema should not be affected by
 * pushProjection, ie. getSchema should always return the original schema
 * even after pushProjection
 * @throws IOException if an exception occurs while determining the schema
 */
public ResourceSchema getSchema(String location, Job job) throws IOException {
    // Open the schema file and read the schema
    // Get an HDFS handle.
    FileSystem fs = FileSystem.get(job.getConfiguration());
    DataInputStream in = fs.open(new Path(location + "/_schema"));
    String line = in.readLine();
    in.close();

    // Parse the schema
    ResourceSchema s = new ResourceSchema(Utils.getSchemaFromString(line));
    if (s == null) {
        throw new IOException("Unable to parse schema found in file " + location + "/_schema");
    }

    // Now that we have determined the schema, store it in our
    // UDFContext properties object so we have it when we need it on the
    // backend
    UDFContext udfc = UDFContext.getUDFContext();
    Properties p = udfc.getUDFProperties(this.getClass(), new String[] { udfcSignature });
    p.setProperty("pig.jsonloader.schema", line);

    return s;
}

From source file:com.acme.marketing.MetroResolver.java

License:Apache License

public String exec(Tuple input) throws IOException {
    if (lookup == null) {
        // We have not been initialized yet, so do it now.
        lookup = new HashMap<String, String>();

        // Get an instance of the HDFS FileSystem class so
        // we can read a file from HDFS.  We need a copy of
        // our configuration to do that.
        // Read the configuration from the UDFContext
        FileSystem fs = FileSystem.get(UDFContext.getUDFContext().getJobConf());
        DataInputStream in = fs.open(new Path(lookupFile));
        String line;
        while ((line = in.readLine()) != null) {
            String[] toks = new String[2];
            toks = line.split(":", 2);
            lookup.put(toks[0], toks[1]);
        }
        in.close();
    }
    return lookup.get((String) input.get(0));
}

From source file:com.alectenharmsel.research.WholeBlockRecordReader.java

License:Apache License

public boolean nextKeyValue() throws IOException, InterruptedException {
    if (!processed) {
        System.err.println("start is " + start);
        Path file = fileSplit.getPath();
        String tmp = file.toString();
        System.err.println("File: " + tmp);
        currKey.set(tmp);
        System.err.println("Reached this point");
        FileSystem fs = file.getFileSystem(conf);
        System.err.println("fs blocksize: " + fs.getDefaultBlockSize(file));
        System.err.println("linecount blocksize: " + blockSize);
        byte[] contents;
        FSDataInputStream in = null;
        try {
            in = fs.open(file);
            System.err.println("getPos(): " + in.getPos());

            if ((start + blockSize) > fileLength) {
                blockSize = (int) (fileLength - start);
                processed = true;
            }

            contents = new byte[blockSize];

            //IOUtils.readFully(in, contents, start, blockSize);
            //IOUtils.readFully(in, contents, 0, blockSize);
            in.readFully(start, contents);

            start += blockSize;

            currValue.set(contents);
        } finally {
            IOUtils.closeStream(in);
        }
        return true;
    }
    return false;
}

From source file:com.alexholmes.hadooputils.io.FileUtils.java

License:Apache License

/**
 * Read the contents of the supplied file into a list.
 *
 * @param fs a Hadoop file system
 * @param p  the file path
 * @return array of lines in the file
 * @throws java.io.IOException if something goes wrong
 */
public static List<String> readLines(final FileSystem fs, final Path p) throws IOException {
    InputStream stream = fs.open(p);
    try {
        return IOUtils.readLines(stream);
    } finally {
        stream.close();
    }
}

From source file:com.alexholmes.hadooputils.sort.DelimitedLineRecordReader.java

License:Apache License

protected void initialize(Configuration job, FileSplit split) throws IOException {
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    String rowDelim = job.get("textinputformat.record.delimiter", null);
    if (codec != null) {
        if (rowDelim != null) {
            byte[] hexcode = SortConfig.getHexDelimiter(rowDelim);
            in = new DelimitedLineReader(codec.createInputStream(fileIn), job,
                    (hexcode != null) ? hexcode : rowDelim.getBytes());
        } else {
            in = new DelimitedLineReader(codec.createInputStream(fileIn), job);
        }
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        if (rowDelim != null) {
            byte[] hexcode = SortConfig.getHexDelimiter(rowDelim);
            in = new DelimitedLineReader(fileIn, job, (hexcode != null) ? hexcode : rowDelim.getBytes());
        } else {
            in = new DelimitedLineReader(fileIn, job);
        }
    }
    if (skipFirstLine) { // skip first line and re-establish "start".
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}

From source file:com.alexholmes.hadooputils.sort.LzoDelimitedLineRecordReader.java

License:Apache License

@Override
protected void initialize(Configuration job, FileSplit split) throws IOException {
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    FileSystem fs = file.getFileSystem(job);
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);
    if (codec == null) {
        throw new IOException("No codec for file " + file + " not found, cannot run");
    }

    // open the file and seek to the start of the split
    fileIn = fs.open(split.getPath());

    // creates input stream and also reads the file header
    String rowDelim = job.get("textinputformat.record.delimiter", null);
    if (rowDelim != null) {
        byte[] hexcode = SortConfig.getHexDelimiter(rowDelim);
        in = new DelimitedLineReader(fileIn, job, (hexcode != null) ? hexcode : rowDelim.getBytes());
    } else {
        in = new DelimitedLineReader(codec.createInputStream(fileIn), job);
    }

    if (start != 0) {
        fileIn.seek(start);

        // read and ignore the first line
        in.readLine(new Text());
        start = fileIn.getPos();
    }

    this.pos = start;
}