List of usage examples for org.apache.hadoop.fs FileSystem open
public FSDataInputStream open(Path f) throws IOException
From source file:co.nubetech.hiho.merge.TestMergeJob.java
License:Apache License
@Test public void testMergeByKeyWithKeyValueTextInputFormat() throws Exception { final String inputData1 = "A\tMacon Kent,6269 Aenean St.,1-247-399-1051,08253" + "\nB\tDale Zamora,521-7792 Mauris Rd.,1-214-625-6970,90510" + "\nC\tCharles Wood,525-9709 In Rd.,1-370-528-4758,62714"; final String inputData2 = "A\tTimon Leonard,716 Ac Ave,1-857-935-3882,62240" + "\nD\tMacaulay Jackson,5435 Dui. Avenue,1-770-395-6446,31584" + "\nB\tCharles Wood,525-9709 In Rd.,1-370-528-4758,62714"; createTextFileInHDFS(inputData1, "/input1", "testFile1.txt"); createTextFileInHDFS(inputData2, "/input2", "testFile2.txt"); String[] args = new String[] { "-newPath", "/input1", "-oldPath", "/input2", "-mergeBy", "key", "-outputPath", "output", "-inputFormat", "org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat", "-inputKeyClassName", "org.apache.hadoop.io.Text", "-inputValueClassName", "org.apache.hadoop.io.Text", "-outputFormat", "co.nubetech.hiho.mapreduce.lib.output.NoKeyOnlyValueOutputFormat" }; MergeJob job = runMergeJobs(args);//from w ww . jav a2 s. c om assertEquals(3, job.getTotalRecordsNew()); assertEquals(3, job.getTotalRecordsOld()); assertEquals(0, job.getBadRecords()); assertEquals(4, job.getOutput()); FileSystem outputFS = getFileSystem(); Path outputPath = new Path(outputFS.getHomeDirectory(), "output"); FileStatus[] status = outputFS.listStatus(outputPath, getOutputPathFilter()); assertTrue(outputFS.exists(outputPath)); List<String> expectedOutput = new ArrayList<String>(); expectedOutput.add("Macon Kent,6269 Aenean St.,1-247-399-1051,08253"); expectedOutput.add("Dale Zamora,521-7792 Mauris Rd.,1-214-625-6970,90510"); expectedOutput.add("Charles Wood,525-9709 In Rd.,1-370-528-4758,62714"); expectedOutput.add("Macaulay Jackson,5435 Dui. Avenue,1-770-395-6446,31584"); int count = 0; for (FileStatus fileStat : status) { logger.debug("File status is " + fileStat.getPath() + " and is it a dir? 
" + fileStat.isDirectory()); FSDataInputStream in = outputFS.open(fileStat.getPath()); String line = null; while ((line = in.readLine()) != null) { logger.debug("Output is " + line); assertTrue("Matched output " + line, expectedOutput.contains(line)); expectedOutput.remove(line); count++; } in.close(); } assertEquals(4, count); }
From source file:co.nubetech.hiho.merge.TestMergeJob.java
License:Apache License
/**
 * Runs a merge-by-value job over two sequence-file inputs and checks both the
 * job counters and the lines written to the output directory.
 *
 * <p>The value "Charles Wood,..." occurs in both inputs, so the expected
 * output holds five distinct values from the six input records.
 *
 * @throws Exception if the test fixtures or the merge job fail
 */
@Test
public void testMergeByValueWithSequenceFileAsTextInputFormat() throws Exception {
    HashMap<IntWritable, Text> inputData1 = new HashMap<IntWritable, Text>();
    inputData1.put(new IntWritable(1), new Text("Macon Kent,6269 Aenean St.,1-247-399-1051,08253"));
    inputData1.put(new IntWritable(2), new Text("Dale Zamora,521-7792 Mauris Rd.,1-214-625-6970,90510"));
    inputData1.put(new IntWritable(3), new Text("Charles Wood,525-9709 In Rd.,1-370-528-4758,62714"));
    createSequenceFileInHdfs(inputData1, "/input1", "testFile1.seq");

    HashMap<IntWritable, Text> inputData2 = new HashMap<IntWritable, Text>();
    inputData2.put(new IntWritable(1), new Text("Timon Leonard,716 Ac Ave,1-857-935-3882,62240"));
    inputData2.put(new IntWritable(2), new Text("Macaulay Jackson,5435 Dui. Avenue,1-770-395-6446,31584"));
    inputData2.put(new IntWritable(4), new Text("Charles Wood,525-9709 In Rd.,1-370-528-4758,62714"));
    createSequenceFileInHdfs(inputData2, "/input2", "testFile2.seq");

    String[] args = new String[] {
            "-newPath", "/input1",
            "-oldPath", "/input2",
            "-mergeBy", "value",
            "-outputPath", "output",
            "-inputFormat", "org.apache.hadoop.mapreduce.lib.input.SequenceFileAsTextInputFormat",
            "-inputKeyClassName", "org.apache.hadoop.io.Text",
            "-inputValueClassName", "org.apache.hadoop.io.Text",
            "-outputFormat", "co.nubetech.hiho.mapreduce.lib.output.NoKeyOnlyValueOutputFormat" };
    MergeJob job = runMergeJobs(args);
    assertEquals(3, job.getTotalRecordsNew());
    assertEquals(3, job.getTotalRecordsOld());
    assertEquals(0, job.getBadRecords());
    assertEquals(5, job.getOutput());

    FileSystem outputFS = getFileSystem();
    Path outputPath = new Path(outputFS.getHomeDirectory(), "output");
    // Check existence before listing so a missing output directory is reported
    // as an assertion failure rather than as an error from listStatus.
    assertTrue(outputFS.exists(outputPath));
    FileStatus[] status = outputFS.listStatus(outputPath, getOutputPathFilter());

    List<String> expectedOutput = new ArrayList<String>();
    expectedOutput.add("Macon Kent,6269 Aenean St.,1-247-399-1051,08253");
    expectedOutput.add("Dale Zamora,521-7792 Mauris Rd.,1-214-625-6970,90510");
    expectedOutput.add("Charles Wood,525-9709 In Rd.,1-370-528-4758,62714");
    expectedOutput.add("Timon Leonard,716 Ac Ave,1-857-935-3882,62240");
    expectedOutput.add("Macaulay Jackson,5435 Dui. Avenue,1-770-395-6446,31584");

    int count = 0;
    for (FileStatus fileStat : status) {
        logger.debug("File status is " + fileStat.getPath() + " and is it a dir? " + fileStat.isDirectory());
        FSDataInputStream in = outputFS.open(fileStat.getPath());
        try {
            String line = null;
            while ((line = in.readLine()) != null) {
                logger.debug("Output is " + line);
                assertTrue("Matched output " + line, expectedOutput.contains(line));
                // Remove each matched line so a duplicate in the output fails the check above.
                expectedOutput.remove(line);
                count++;
            }
        } finally {
            // Close even when an assertion fails part-way through the file.
            in.close();
        }
    }
    assertEquals(5, count);
}
From source file:coldstorage.io.Reader.java
License:Apache License
public static void main(String[] args) throws IOException { List<Long> idsToFind = new ArrayList<Long>(); int maxId = 100000000; Random random = new Random(1); for (int i = 0; i < 1000; i++) { long id = (long) random.nextInt(maxId); // System.out.println(id); idsToFind.add(id);//from w w w .j a va2 s .co m } // idsToFind.clear(); // idsToFind.add(58998000L); // Path pathData = new Path("./out/data.avro"); // Path pathIndex = new Path("./out/data.index"); Path pathData = new Path("hdfs://localhost:9000/avro/out/data.avro"); Path pathIndex = new Path("hdfs://localhost:9000/avro/out/data.index"); Configuration configuration = new Configuration(); FileSystem fileSystem = pathData.getFileSystem(configuration); FileStatus indexFileStatus = fileSystem.getFileStatus(pathIndex); FileStatus dataFileStatus = fileSystem.getFileStatus(pathData); FSDataInputStream indexInputStream = fileSystem.open(pathIndex); FSDataInputStream dataInputStream = fileSystem.open(pathData); AvroFSInput fsInput = new AvroFSInput(dataInputStream, dataFileStatus.getLen()); GenericDatumReader<GenericRecord> gdr = new GenericDatumReader<GenericRecord>(); DataFileReader<GenericRecord> reader = new DataFileReader<GenericRecord>(fsInput, gdr); List<IndexKey> list = getList(indexInputStream, indexFileStatus.getLen()); for (Long idToFind : idsToFind) { long t1 = System.nanoTime(); GenericRecord lookupRecord = lookupRecord(reader, list, idToFind); long t2 = System.nanoTime(); System.out.println("Found [" + idToFind + "] in [" + (t2 - t1) / 1000000.0 + " ms]:" + lookupRecord); } }
From source file:ColumnStorage.ColumnProject.java
License:Open Source License
/**
 * Reads the navigator file at {@code naviPath} and appends one entry to
 * {@code infos} per column record it contains.
 *
 * <p>The file is read as an int magic value, then a short record count, then
 * that many records, each decoded by {@code loadColumnInfo}. The stream is
 * closed before returning, including on the bad-magic path.
 *
 * @param fs       file system used to open the navigator file
 * @param naviPath path of the navigator file
 * @throws SEException.ErrorFileFormat if the magic value is not {@code ConstVar.NaviMagic}
 * @throws Exception if the file cannot be opened or read
 */
void loadColmnInfoFromNavigator(FileSystem fs, Path naviPath) throws Exception {
    FSDataInputStream in = fs.open(naviPath);
    try {
        int magic = in.readInt();
        if (magic != ConstVar.NaviMagic) {
            throw new SEException.ErrorFileFormat("invalid navi magic:" + magic + ",file:" + naviPath.toString());
        }

        short infoNum = in.readShort();
        for (int i = 0; i < infoNum; i++) {
            // NOTE(review): assumes loadColumnInfo reads its record eagerly and does not
            // keep the stream for later use - confirm before relying on the close below.
            infos.add(loadColumnInfo(in));
        }
    } finally {
        in.close();
    }
}
From source file:com.acme.io.JsonLoader.java
License:Apache License
/** * Get a schema for the data to be loaded. * @param location Location as returned by * {@link LoadFunc#relativeToAbsolutePath(String, org.apache.hadoop.fs.Path)} * @param job The {@link Job} object - this should be used only to obtain * cluster properties through {@link Job#getConfiguration()} and not to * set/query any runtime job information. * @return schema for the data to be loaded. This schema should represent * all tuples of the returned data. If the schema is unknown or it is * not possible to return a schema that represents all returned data, * then null should be returned. The schema should not be affected by * pushProjection, ie. getSchema should always return the original schema * even after pushProjection/* w ww .j a v a 2s .co m*/ * @throws IOException if an exception occurs while determining the schema */ public ResourceSchema getSchema(String location, Job job) throws IOException { // Open the schema file and read the schema // Get an HDFS handle. FileSystem fs = FileSystem.get(job.getConfiguration()); DataInputStream in = fs.open(new Path(location + "/_schema")); String line = in.readLine(); in.close(); // Parse the schema ResourceSchema s = new ResourceSchema(Utils.getSchemaFromString(line)); if (s == null) { throw new IOException("Unable to parse schema found in file " + location + "/_schema"); } // Now that we have determined the schema, store it in our // UDFContext properties object so we have it when we need it on the // backend UDFContext udfc = UDFContext.getUDFContext(); Properties p = udfc.getUDFProperties(this.getClass(), new String[] { udfcSignature }); p.setProperty("pig.jsonloader.schema", line); return s; }
From source file:com.acme.marketing.MetroResolver.java
License:Apache License
public String exec(Tuple input) throws IOException { if (lookup == null) { // We have not been initialized yet, so do it now. lookup = new HashMap<String, String>(); // Get an instance of the HDFS FileSystem class so // we can read a file from HDFS. We need a copy of // our configuration to do that. // Read the configuration from the UDFContext FileSystem fs = FileSystem.get(UDFContext.getUDFContext().getJobConf()); DataInputStream in = fs.open(new Path(lookupFile)); String line;/* w w w . j a va 2 s .c o m*/ while ((line = in.readLine()) != null) { String[] toks = new String[2]; toks = line.split(":", 2); lookup.put(toks[0], toks[1]); } in.close(); } return lookup.get((String) input.get(0)); }
From source file:com.alectenharmsel.research.WholeBlockRecordReader.java
License:Apache License
public boolean nextKeyValue() throws IOException, InterruptedException { if (!processed) { System.err.println("start is " + start); Path file = fileSplit.getPath(); String tmp = file.toString(); System.err.println("File: " + tmp); currKey.set(tmp);/*from ww w .j a va 2s. c o m*/ System.err.println("Reached this point"); FileSystem fs = file.getFileSystem(conf); System.err.println("fs blocksize: " + fs.getDefaultBlockSize(file)); System.err.println("linecount blocksize: " + blockSize); byte[] contents; FSDataInputStream in = null; try { in = fs.open(file); System.err.println("getPos(): " + in.getPos()); if ((start + blockSize) > fileLength) { blockSize = (int) (fileLength - start); processed = true; } contents = new byte[blockSize]; //IOUtils.readFully(in, contents, start, blockSize); //IOUtils.readFully(in, contents, 0, blockSize); in.readFully(start, contents); start += blockSize; currValue.set(contents); } finally { IOUtils.closeStream(in); } return true; } return false; }
From source file:com.alexholmes.hadooputils.io.FileUtils.java
License:Apache License
/**
 * Loads every line of a file into memory.
 *
 * @param fs the Hadoop file system that holds the file
 * @param p  the path of the file to read
 * @return the lines of the file as a list, in file order
 * @throws java.io.IOException if the file cannot be opened, read, or closed
 */
public static List<String> readLines(final FileSystem fs, final Path p) throws IOException {
    final InputStream source = fs.open(p);
    try {
        final List<String> lines = IOUtils.readLines(source);
        return lines;
    } finally {
        // Always release the stream, whether or not reading succeeded.
        source.close();
    }
}
From source file:com.alexholmes.hadooputils.sort.DelimitedLineRecordReader.java
License:Apache License
protected void initialize(Configuration job, FileSplit split) throws IOException { this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE); start = split.getStart();//from ww w . java2 s . co m end = start + split.getLength(); final Path file = split.getPath(); compressionCodecs = new CompressionCodecFactory(job); final CompressionCodec codec = compressionCodecs.getCodec(file); // open the file and seek to the start of the split FileSystem fs = file.getFileSystem(job); fileIn = fs.open(split.getPath()); boolean skipFirstLine = false; String rowDelim = job.get("textinputformat.record.delimiter", null); if (codec != null) { if (rowDelim != null) { byte[] hexcode = SortConfig.getHexDelimiter(rowDelim); in = new DelimitedLineReader(codec.createInputStream(fileIn), job, (hexcode != null) ? hexcode : rowDelim.getBytes()); } else { in = new DelimitedLineReader(codec.createInputStream(fileIn), job); } end = Long.MAX_VALUE; } else { if (start != 0) { skipFirstLine = true; --start; fileIn.seek(start); } if (rowDelim != null) { byte[] hexcode = SortConfig.getHexDelimiter(rowDelim); in = new DelimitedLineReader(fileIn, job, (hexcode != null) ? hexcode : rowDelim.getBytes()); } else { in = new DelimitedLineReader(fileIn, job); } } if (skipFirstLine) { // skip first line and re-establish "start". start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start)); } this.pos = start; }
From source file:com.alexholmes.hadooputils.sort.LzoDelimitedLineRecordReader.java
License:Apache License
@Override protected void initialize(Configuration job, FileSplit split) throws IOException { start = split.getStart();/* www.jav a2 s . c o m*/ end = start + split.getLength(); final Path file = split.getPath(); FileSystem fs = file.getFileSystem(job); CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(job); final CompressionCodec codec = compressionCodecs.getCodec(file); if (codec == null) { throw new IOException("No codec for file " + file + " not found, cannot run"); } // open the file and seek to the start of the split fileIn = fs.open(split.getPath()); // creates input stream and also reads the file header String rowDelim = job.get("textinputformat.record.delimiter", null); if (rowDelim != null) { byte[] hexcode = SortConfig.getHexDelimiter(rowDelim); in = new DelimitedLineReader(fileIn, job, (hexcode != null) ? hexcode : rowDelim.getBytes()); } else { in = new DelimitedLineReader(codec.createInputStream(fileIn), job); } if (start != 0) { fileIn.seek(start); // read and ignore the first line in.readLine(new Text()); start = fileIn.getPos(); } this.pos = start; }