List of usage examples for org.apache.hadoop.fs FileSystem open
public FSDataInputStream open(Path f) throws IOException
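Before the collected examples, a minimal self-contained sketch of the usual pattern: open a Path, wrap the stream, read, and let try-with-resources close it. The path argument is a placeholder; any Hadoop-supported scheme (hdfs://, file://, etc.) works.

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class OpenExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path p = new Path(args[0]); // placeholder, e.g. "hdfs://namenode:8020/data/input.txt"
        FileSystem fs = p.getFileSystem(conf); // resolve the filesystem from the path's scheme
        // open() returns a seekable FSDataInputStream
        try (FSDataInputStream in = fs.open(p);
                BufferedReader reader = new BufferedReader(
                        new InputStreamReader(in, StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                System.out.println(line);
            }
        }
    }
}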
From source file:com.cloudera.recordbreaker.analyzer.TextRegexpDataDescriptor.java
License:Open Source License
public void prepareAvroFile(FileSystem srcFs, FileSystem dstFs, Path dst, Configuration conf) throws IOException {
    SchemaDescriptor sd = this.getSchemaDescriptor().get(0);
    List<Schema> unionFreeSchemas = SchemaUtils.getUnionFreeSchemasByFrequency(sd, 100, true);
    Schema schema = unionFreeSchemas.get(0);

    // Open stream to write out Avro contents
    DatumWriter<GenericRecord> writer = new GenericDatumWriter<GenericRecord>(schema);
    DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<GenericRecord>(writer);
    dataFileWriter.create(schema, dstFs.create(dst, true));
    try {
        BufferedReader in = new BufferedReader(new InputStreamReader(srcFs.open(getFilename())));
        try {
            String rowStr = null;
            GenericData.Record rowRecord = null;
            while ((rowStr = in.readLine()) != null) {
                for (int i = 0; i < regexps.size(); i++) {
                    Pattern curPattern = regexps.get(i);
                    Schema curSchema = localschemas.get(i);
                    Matcher curMatcher = curPattern.matcher(rowStr);
                    if (curMatcher.find()) {
                        // Create Avro record here
                        rowRecord = new GenericData.Record(curSchema);
                        List<Schema.Field> curFields = curSchema.getFields();
                        for (int j = 0; j < curMatcher.groupCount(); j++) {
                            Schema.Field curField = curFields.get(j);
                            String fieldName = curField.name();
                            Schema fieldType = curField.schema();
                            String rawFieldValue = curMatcher.group(j + 1);

                            Object fieldValue = null;
                            if (fieldType.getType() == Schema.Type.INT) {
                                fieldValue = Integer.parseInt(rawFieldValue);
                            } else if (fieldType.getType() == Schema.Type.FLOAT) {
                                fieldValue = Float.parseFloat(rawFieldValue);
                            } else if (fieldType.getType() == Schema.Type.STRING) {
                                fieldValue = rawFieldValue;
                            }
                            if (fieldValue != null) {
                                rowRecord.put(fieldName, fieldValue);
                            }
                        }
                        if (rowRecord.getSchema().toString().hashCode() == schema.toString().hashCode()) {
                            dataFileWriter.append(rowRecord);
                        }
                    }
                }
            }
        } finally {
            in.close();
        }
    } finally {
        dataFileWriter.close();
    }
}
From source file:com.cloudera.recordbreaker.analyzer.UnknownTextDataDescriptor.java
License:Open Source License
public static boolean isTextData(FileSystem fs, Path p) {
    try {
        BufferedInputStream in = new BufferedInputStream(fs.open(p));
        try {
            byte[] buf = new byte[1024];
            int numBytes = in.read(buf);
            if (numBytes < 0) {
                return false;
            }
            // Count printable-ASCII bytes in the first block of the file; the file is
            // treated as text when their fraction exceeds asciiThreshold.
            int numASCIIChars = 0;
            for (int i = 0; i < numBytes; i++) {
                if (buf[i] >= 32 && buf[i] < 128) {
                    numASCIIChars++;
                }
            }
            return ((numASCIIChars / (1.0 * numBytes)) > asciiThreshold);
        } finally {
            in.close();
        }
    } catch (IOException iex) {
        return false;
    }
}
From source file:com.cloudera.recordbreaker.analyzer.UnknownTextDataDescriptor.java
License:Open Source License
public UnknownTextDataDescriptor(FileSystem fs, Path p, File schemaDictDir) throws IOException {
    super(p, fs, TEXTDATA_TYPE);
    this.schemaDictDir = schemaDictDir;
    UnknownTextSchemaDescriptor tsd = new UnknownTextSchemaDescriptor(this);

    // Test if this schema descriptor can parse the file
    boolean hasLatentStructure = false;
    int numTuples = 0;
    for (Iterator it = tsd.getIterator(); it.hasNext();) {
        numTuples++;
        it.next();
    }

    int numLines = 0;
    BufferedReader in = new BufferedReader(new InputStreamReader(fs.open(p)));
    try {
        while (in.readLine() != null) {
            numLines++;
        }
    } finally {
        // Close the reader once the line count is known
        in.close();
    }

    numTuples = Math.min(numTuples, UnknownTextSchemaDescriptor.MAX_LINES);
    numLines = Math.min(numLines, UnknownTextSchemaDescriptor.MAX_LINES);
    if ((numTuples / (1.0 * numLines)) < TUPLE_PCT) {
        throw new IOException("Cannot parse structured text data");
    }
    this.schemas.add(new UnknownTextSchemaDescriptor(this));
}
From source file:com.cloudera.recordbreaker.analyzer.UnknownTextSchemaDescriptor.java
License:Open Source License
void computeSchema() throws IOException {
    this.randId = new Random().nextInt();
    LearnStructure ls = new LearnStructure();
    FileSystem fs = FSAnalyzer.getInstance().getFS();
    FileSystem localFS = FileSystem.getLocal(new Configuration());
    Path inputPath = dd.getFilename();
    File workingParserFile = File.createTempFile("textdesc", "typetree", null);
    File workingSchemaFile = File.createTempFile("textdesc", "schema", null);

    ls.inferRecordFormat(fs, inputPath, localFS, new Path(workingSchemaFile.getCanonicalPath()),
            new Path(workingParserFile.getCanonicalPath()), null, null, false, MAX_LINES);

    this.schema = Schema.parse(workingSchemaFile);
    DataInputStream in = new DataInputStream(localFS.open(new Path(workingParserFile.getCanonicalPath())));
    try {
        this.typeTree = InferredType.readType(in);
    } catch (IOException iex) {
        iex.printStackTrace();
        throw iex;
    } finally {
        in.close();
    }
    //System.err.println("Recovered unknowntext schema: " + schema);
}
From source file:com.cloudera.recordbreaker.hive.borrowed.AvroSerdeUtils.java
License:Apache License
protected static Schema getSchemaFromHDFS(String schemaHDFSUrl, Configuration conf) throws IOException {
    FileSystem fs = FileSystem.get(conf);
    FSDataInputStream in = null;
    try {
        in = fs.open(new Path(schemaHDFSUrl));
        Schema s = Schema.parse(in);
        return s;
    } finally {
        if (in != null)
            in.close();
    }
}
From source file:com.cloudera.recordbreaker.learnstructure.LearnStructure.java
License:Open Source License
/**
 * Infers a record format for the text file at <code>p</code>, writing out an Avro
 * schema, a serialized parser, and (optionally) JSON- and Avro-encoded copies of the data.
 */
public void inferRecordFormat(FileSystem fs, Path p, FileSystem fs2, Path schemaFile, Path parseTreeFile,
        Path jsonDataFile, Path avroDataFile, boolean verbose, int maxLines) throws IOException {
    // Store parse errors and results
    List<Integer> unparseableLineNos = new ArrayList<Integer>();
    List<String> unparseableStrs = new ArrayList<String>();
    List<Integer> parseableLineNos = new ArrayList<Integer>();
    List<List<Token.AbstractToken>> allChunks = new ArrayList<List<Token.AbstractToken>>();

    //
    // Transform the text into a list of "chunks". A single chunk corresponds to a line of text.
    // A chunk is a list of Tokens.
    //
    long startRead = System.currentTimeMillis();
    BufferedReader in = new BufferedReader(new InputStreamReader(fs.open(p)));
    try {
        String s = in.readLine();
        int lineno = 0;
        while (s != null) {
            if (maxLines >= 0 && lineno >= maxLines) {
                break;
            }
            List<Token.AbstractToken> chunkToks = Tokenizer.tokenize(s);
            if (chunkToks != null) {
                allChunks.add(chunkToks);
                parseableLineNos.add(lineno);
            } else {
                unparseableStrs.add(s);
                unparseableLineNos.add(lineno);
            }
            s = in.readLine();
            lineno++;
        }
    } finally {
        in.close();
    }

    //
    // Infer type structure from the tokenized chunks
    //
    long start = System.currentTimeMillis();
    InferredType typeTree = TypeInference.infer(allChunks);
    long end = System.currentTimeMillis();
    double loadTime = (start - startRead) / 1000.0;
    double inferTime = (end - start) / 1000.0;
    double totalTime = (end - startRead) / 1000.0;
    if (verbose) {
        System.err.println("Number of chunks: " + allChunks.size());
        System.err.println("Elapsed load time: " + loadTime);
        System.err.println("Elapsed inference time: " + inferTime);
        System.err.println("Total execution time: " + totalTime);
    }

    //
    // The existing type tree is now correct, but could probably be more succinct.
    // We can now improve/rewrite it.
    //

    //
    // Should every top-level type be ARRAY, so as to allow repeated log lines?
    // Or does the Avro format allow an implicit top-level repeating structure?
    //

    //
    // Dump the results. We emit:
    // 1) A JSON/Avro schema
    // 2) A serialized parser program that can consume data and emit Avro files using the given schema
    //
    Schema s = typeTree.getAvroSchema();
    if (schemaFile != null) {
        BufferedWriter out = new BufferedWriter(new OutputStreamWriter(fs2.create(schemaFile)));
        try {
            out.write(s.toString(true));
        } finally {
            out.close();
        }
    }
    if (parseTreeFile != null) {
        DataOutputStream outd = new DataOutputStream(new BufferedOutputStream(fs2.create(parseTreeFile)));
        try {
            typeTree.write(outd);
        } finally {
            outd.close();
        }
    }

    //
    // Apply the typetree's parser.
    //
    if (jsonDataFile != null) {
        Schema schema = typeTree.getAvroSchema();
        GenericDatumWriter jsonGDWriter = new GenericDatumWriter(schema);
        BufferedOutputStream outJson = new BufferedOutputStream(fs2.create(jsonDataFile));
        JsonEncoder encoder = EncoderFactory.get().jsonEncoder(schema, outJson);
        try {
            in = new BufferedReader(new InputStreamReader(fs.open(p)));
            try {
                String str = in.readLine();
                while (str != null) {
                    GenericContainer gct = typeTree.parse(str);
                    if (gct != null) {
                        jsonGDWriter.write(gct, encoder);
                    }
                    str = in.readLine();
                }
            } finally {
                in.close();
            }
        } finally {
            encoder.flush();
            outJson.close();
        }
    }
    if (avroDataFile != null) {
        int numGoodParses = 0;
        int lineno = 0;
        Schema schema = typeTree.getAvroSchema();
        GenericDatumWriter gdWriter = new GenericDatumWriter(schema);
        DataFileWriter outData = new DataFileWriter(gdWriter);
        outData = outData.create(schema, fs2.create(avroDataFile));
        try {
            in = new BufferedReader(new InputStreamReader(fs.open(p)));
            try {
                String str = in.readLine();
                while (str != null) {
                    GenericContainer gct = typeTree.parse(str);
                    if (gct != null) {
                        numGoodParses++;
                        outData.append(gct);
                    } else {
                        if (verbose) {
                            System.err.println("unparsed line: '" + str + "'");
                        }
                    }
                    str = in.readLine();
                    lineno++;
                }
            } finally {
                in.close();
            }
        } finally {
            outData.close();
        }
        if (verbose) {
            System.err.println();
            System.err.println("Total # input lines: " + lineno);
            System.err.println("Total # lines parsed correctly: " + numGoodParses);
        }
    }
}
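For context, a hypothetical invocation of inferRecordFormat, modeled on the computeSchema() call shown above; all paths here are placeholders.

// Hypothetical driver code; the input and output paths are placeholders.
FileSystem localFs = FileSystem.getLocal(new Configuration());
LearnStructure ls = new LearnStructure();
ls.inferRecordFormat(localFs, new Path("/tmp/input.log"), // text file to analyze
        localFs, new Path("/tmp/schema.avsc"), // where the inferred Avro schema is written
        new Path("/tmp/parser.dat"), // where the serialized parser is written
        null, null, // skip the JSON and Avro data dumps
        true, 1000); // verbose output, examine at most 1000 lines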
From source file:com.cloudera.recordservice.mapreduce.testapps.RecordCount.java
License:Apache License
public static long countRecords(String path) throws IOException {
    String output = TestUtil.getTempDirectory();
    Path inputPath = new Path(path);
    Path outputPath = new Path(output);

    JobConf conf = new JobConf(RecordCount.class);
    conf.setJobName("recordcount");
    conf.setOutputKeyClass(NullWritable.class);
    conf.setOutputValueClass(LongWritable.class);
    conf.setInt("mapreduce.job.reduces", 1);
    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);
    conf.setInputFormat(com.cloudera.recordservice.mapred.TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, outputPath);

    JobClient.runJob(conf);

    // Read the result and return it. Since we set the number of reducers to 1,
    // there is always just one file containing the value.
    FileSystem fs = outputPath.getFileSystem(conf);
    FSDataInputStream resultStream = fs.open(new Path(output + "/part-00000"));
    byte[] bytes = new byte[16];
    int length = resultStream.read(bytes);
    resultStream.close();
    String result = new String(bytes, 0, length).trim();
    return Long.parseLong(result);
}
From source file:com.cloudera.sa.ExcelRecordReader.java
License:Apache License
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
    FileSplit fileSplit = (FileSplit) split;
    Configuration conf = context.getConfiguration();
    Path file = fileSplit.getPath();
    FileSystem fs = file.getFileSystem(conf);
    this.in = fs.open(file);

    XSSFWorkbook workbook = new XSSFWorkbook(this.in);
    XSSFSheet sheet = workbook.getSheetAt(0);

    this.totalRows = sheet.getPhysicalNumberOfRows();
    this.processedRows = 0;
    this.rowIterator = sheet.rowIterator();
}
From source file:com.cloudera.science.avro.common.SchemaLoader.java
License:Open Source License
public Schema loadFromUrl(String schemaUrl) throws IOException {
    if (schemaUrl.toLowerCase().startsWith("hdfs://")) {
        FileSystem fs = FileSystem.get(conf);
        FSDataInputStream input = null;
        try {
            input = fs.open(new Path(schemaUrl));
            return parser.parse(input);
        } finally {
            if (input != null) {
                input.close();
            }
        }
    } else {
        InputStream is = null;
        try {
            is = new URL(schemaUrl).openStream();
            return parser.parse(is);
        } finally {
            if (is != null) {
                is.close();
            }
        }
    }
}
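A side note on the pattern: the explicit null-check-and-close above predates Java 7. A sketch of the same logic with try-with-resources, assuming the same parser and conf fields, reads:

public Schema loadFromUrl(String schemaUrl) throws IOException {
    if (schemaUrl.toLowerCase().startsWith("hdfs://")) {
        FileSystem fs = FileSystem.get(conf);
        // FSDataInputStream is Closeable, so try-with-resources closes it on every path
        try (FSDataInputStream input = fs.open(new Path(schemaUrl))) {
            return parser.parse(input);
        }
    }
    try (InputStream is = new URL(schemaUrl).openStream()) {
        return parser.parse(is);
    }
}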
From source file:com.cloudera.sqoop.TestAllTables.java
License:Apache License
public void testMultiTableImport() throws IOException {
    String[] argv = getArgv(true, null);
    runImport(new ImportAllTablesTool(), argv);

    Path warehousePath = new Path(this.getWarehouseDir());
    int i = 0;
    for (String tableName : this.tableNames) {
        Path tablePath = new Path(warehousePath, tableName);
        Path filePath = new Path(tablePath, "part-m-00000");

        // dequeue the expected value for this table. This
        // list has the same order as the tableNames list.
        String expectedVal = Integer.toString(i++) + "," + this.expectedStrings.get(0);
        this.expectedStrings.remove(0);

        BufferedReader reader = null;
        if (!isOnPhysicalCluster()) {
            reader = new BufferedReader(
                    new InputStreamReader(new FileInputStream(new File(filePath.toString()))));
        } else {
            FileSystem dfs = FileSystem.get(getConf());
            FSDataInputStream dis = dfs.open(filePath);
            reader = new BufferedReader(new InputStreamReader(dis));
        }
        try {
            String line = reader.readLine();
            assertEquals("Table " + tableName + " expected a different string", expectedVal, line);
        } finally {
            IOUtils.closeStream(reader);
        }
    }
}