List of usage examples for org.apache.hadoop.fs FileSystem open
public FSDataInputStream open(Path f) throws IOException
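Before the collected examples, a minimal self-contained sketch of the usual pattern: open a Path, wrap the stream, read, and let try-with-resources close it. The path argument is a placeholder; any Hadoop-supported scheme (hdfs://, file://, etc.) works.

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class OpenExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path p = new Path(args[0]); // placeholder, e.g. "hdfs://namenode:8020/data/input.txt"
        FileSystem fs = p.getFileSystem(conf); // resolve the filesystem from the path's scheme
        // open() returns a seekable FSDataInputStream
        try (FSDataInputStream in = fs.open(p);
                BufferedReader reader = new BufferedReader(
                        new InputStreamReader(in, StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                System.out.println(line);
            }
        }
    }
}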
From source file:com.cloudera.recordbreaker.analyzer.TextRegexpDataDescriptor.java
License:Open Source License
public void prepareAvroFile(FileSystem srcFs, FileSystem dstFs, Path dst, Configuration conf) throws IOException {
    SchemaDescriptor sd = this.getSchemaDescriptor().get(0);
    List<Schema> unionFreeSchemas = SchemaUtils.getUnionFreeSchemasByFrequency(sd, 100, true);
    Schema schema = unionFreeSchemas.get(0);

    // Open stream to write out Avro contents
    DatumWriter<GenericRecord> writer = new GenericDatumWriter<GenericRecord>(schema);
    DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<GenericRecord>(writer);
    dataFileWriter.create(schema, dstFs.create(dst, true));
    try {
        BufferedReader in = new BufferedReader(new InputStreamReader(srcFs.open(getFilename())));
        try {
            String rowStr = null;
            GenericData.Record rowRecord = null;
            while ((rowStr = in.readLine()) != null) {
                for (int i = 0; i < regexps.size(); i++) {
                    Pattern curPattern = regexps.get(i);
                    Schema curSchema = localschemas.get(i);
                    Matcher curMatcher = curPattern.matcher(rowStr);
                    if (curMatcher.find()) {
                        // Create Avro record here
                        rowRecord = new GenericData.Record(curSchema);
                        List<Schema.Field> curFields = curSchema.getFields();
                        for (int j = 0; j < curMatcher.groupCount(); j++) {
                            Schema.Field curField = curFields.get(j);
                            String fieldName = curField.name();
                            Schema fieldType = curField.schema();
                            String rawFieldValue = curMatcher.group(j + 1);

                            Object fieldValue = null;
                            if (fieldType.getType() == Schema.Type.INT) {
                                fieldValue = Integer.parseInt(rawFieldValue);
                            } else if (fieldType.getType() == Schema.Type.FLOAT) {
                                fieldValue = Float.parseFloat(rawFieldValue);
                            } else if (fieldType.getType() == Schema.Type.STRING) {
                                fieldValue = rawFieldValue;
                            }
                            if (fieldValue != null) {
                                rowRecord.put(fieldName, fieldValue);
                            }
                        }
                        if (rowRecord.getSchema().toString().hashCode() == schema.toString().hashCode()) {
                            dataFileWriter.append(rowRecord);
                        }
                    }
                }
            }
        } finally {
            in.close();
        }
    } finally {
        dataFileWriter.close();
    }
}
From source file:com.cloudera.recordbreaker.analyzer.UnknownTextDataDescriptor.java
License:Open Source License
public static boolean isTextData(FileSystem fs, Path p) {
    try {
        BufferedInputStream in = new BufferedInputStream(fs.open(p));
        try {
            byte[] buf = new byte[1024];
            int numBytes = in.read(buf);
            if (numBytes < 0) {
                return false;
            }
            // Count printable-ASCII bytes in the first block of the file; the file is
            // treated as text when their fraction exceeds asciiThreshold.
            int numASCIIChars = 0;
            for (int i = 0; i < numBytes; i++) {
                if (buf[i] >= 32 && buf[i] < 128) {
                    numASCIIChars++;
                }
            }
            return ((numASCIIChars / (1.0 * numBytes)) > asciiThreshold);
        } finally {
            in.close();
        }
    } catch (IOException iex) {
        return false;
    }
}
From source file:com.cloudera.recordbreaker.analyzer.UnknownTextDataDescriptor.java
License:Open Source License
public UnknownTextDataDescriptor(FileSystem fs, Path p, File schemaDictDir) throws IOException {
    super(p, fs, TEXTDATA_TYPE);
    this.schemaDictDir = schemaDictDir;
    UnknownTextSchemaDescriptor tsd = new UnknownTextSchemaDescriptor(this);

    // Test if this schema descriptor can parse the file
    boolean hasLatentStructure = false;
    int numTuples = 0;
    for (Iterator it = tsd.getIterator(); it.hasNext();) {
        numTuples++;
        it.next();
    }

    int numLines = 0;
    BufferedReader in = new BufferedReader(new InputStreamReader(fs.open(p)));
    try {
        while (in.readLine() != null) {
            numLines++;
        }
    } finally {
        // Close the reader once the line count is known
        in.close();
    }

    numTuples = Math.min(numTuples, UnknownTextSchemaDescriptor.MAX_LINES);
    numLines = Math.min(numLines, UnknownTextSchemaDescriptor.MAX_LINES);
    if ((numTuples / (1.0 * numLines)) < TUPLE_PCT) {
        throw new IOException("Cannot parse structured text data");
    }
    this.schemas.add(new UnknownTextSchemaDescriptor(this));
}
From source file:com.cloudera.recordbreaker.analyzer.UnknownTextSchemaDescriptor.java
License:Open Source License
void computeSchema() throws IOException {
    this.randId = new Random().nextInt();
    LearnStructure ls = new LearnStructure();
    FileSystem fs = FSAnalyzer.getInstance().getFS();
    FileSystem localFS = FileSystem.getLocal(new Configuration());
    Path inputPath = dd.getFilename();
    File workingParserFile = File.createTempFile("textdesc", "typetree", null);
    File workingSchemaFile = File.createTempFile("textdesc", "schema", null);

    ls.inferRecordFormat(fs, inputPath, localFS, new Path(workingSchemaFile.getCanonicalPath()),
            new Path(workingParserFile.getCanonicalPath()), null, null, false, MAX_LINES);

    this.schema = Schema.parse(workingSchemaFile);
    DataInputStream in = new DataInputStream(localFS.open(new Path(workingParserFile.getCanonicalPath())));
    try {
        this.typeTree = InferredType.readType(in);
    } catch (IOException iex) {
        iex.printStackTrace();
        throw iex;
    } finally {
        in.close();
    }
    //System.err.println("Recovered unknowntext schema: " + schema);
}
From source file:com.cloudera.recordbreaker.hive.borrowed.AvroSerdeUtils.java
License:Apache License
protected static Schema getSchemaFromHDFS(String schemaHDFSUrl, Configuration conf) throws IOException {
    FileSystem fs = FileSystem.get(conf);
    FSDataInputStream in = null;
    try {
        in = fs.open(new Path(schemaHDFSUrl));
        Schema s = Schema.parse(in);
        return s;
    } finally {
        if (in != null)
            in.close();
    }
}
From source file:com.cloudera.recordbreaker.learnstructure.LearnStructure.java
License:Open Source License
/**
 * Infers a record format for the text file at <code>p</code>, writing out an Avro
 * schema, a serialized parser, and (optionally) JSON- and Avro-encoded copies of the data.
 */
public void inferRecordFormat(FileSystem fs, Path p, FileSystem fs2, Path schemaFile, Path parseTreeFile,
        Path jsonDataFile, Path avroDataFile, boolean verbose, int maxLines) throws IOException {
    // Store parse errors and results
    List<Integer> unparseableLineNos = new ArrayList<Integer>();
    List<String> unparseableStrs = new ArrayList<String>();
    List<Integer> parseableLineNos = new ArrayList<Integer>();
    List<List<Token.AbstractToken>> allChunks = new ArrayList<List<Token.AbstractToken>>();

    //
    // Transform the text into a list of "chunks". A single chunk corresponds to a line of text.
    // A chunk is a list of Tokens.
    //
    long startRead = System.currentTimeMillis();
    BufferedReader in = new BufferedReader(new InputStreamReader(fs.open(p)));
    try {
        String s = in.readLine();
        int lineno = 0;
        while (s != null) {
            if (maxLines >= 0 && lineno >= maxLines) {
                break;
            }
            List<Token.AbstractToken> chunkToks = Tokenizer.tokenize(s);
            if (chunkToks != null) {
                allChunks.add(chunkToks);
                parseableLineNos.add(lineno);
            } else {
                unparseableStrs.add(s);
                unparseableLineNos.add(lineno);
            }
            s = in.readLine();
            lineno++;
        }
    } finally {
        in.close();
    }

    //
    // Infer type structure from the tokenized chunks
    //
    long start = System.currentTimeMillis();
    InferredType typeTree = TypeInference.infer(allChunks);
    long end = System.currentTimeMillis();
    double loadTime = (start - startRead) / 1000.0;
    double inferTime = (end - start) / 1000.0;
    double totalTime = (end - startRead) / 1000.0;
    if (verbose) {
        System.err.println("Number of chunks: " + allChunks.size());
        System.err.println("Elapsed load time: " + loadTime);
        System.err.println("Elapsed inference time: " + inferTime);
        System.err.println("Total execution time: " + totalTime);
    }

    //
    // The existing type tree is now correct, but could probably be more succinct.
    // We can now improve/rewrite it.
    //

    //
    // Should every top-level type be ARRAY, so as to allow repeated log lines?
    // Or does the Avro format allow an implicit top-level repeating structure?
    //

    //
    // Dump the results. We emit:
    // 1) A JSON/Avro schema
    // 2) A serialized parser program that can consume data and emit Avro files using the given schema
    //
    Schema s = typeTree.getAvroSchema();
    if (schemaFile != null) {
        BufferedWriter out = new BufferedWriter(new OutputStreamWriter(fs2.create(schemaFile)));
        try {
            out.write(s.toString(true));
        } finally {
            out.close();
        }
    }
    if (parseTreeFile != null) {
        DataOutputStream outd = new DataOutputStream(new BufferedOutputStream(fs2.create(parseTreeFile)));
        try {
            typeTree.write(outd);
        } finally {
            outd.close();
        }
    }

    //
    // Apply the typetree's parser.
    //
    if (jsonDataFile != null) {
        Schema schema = typeTree.getAvroSchema();
        GenericDatumWriter jsonGDWriter = new GenericDatumWriter(schema);
        BufferedOutputStream outJson = new BufferedOutputStream(fs2.create(jsonDataFile));
        JsonEncoder encoder = EncoderFactory.get().jsonEncoder(schema, outJson);
        try {
            in = new BufferedReader(new InputStreamReader(fs.open(p)));
            try {
                String str = in.readLine();
                while (str != null) {
                    GenericContainer gct = typeTree.parse(str);
                    if (gct != null) {
                        jsonGDWriter.write(gct, encoder);
                    }
                    str = in.readLine();
                }
            } finally {
                in.close();
            }
        } finally {
            encoder.flush();
            outJson.close();
        }
    }
    if (avroDataFile != null) {
        int numGoodParses = 0;
        int lineno = 0;
        Schema schema = typeTree.getAvroSchema();
        GenericDatumWriter gdWriter = new GenericDatumWriter(schema);
        DataFileWriter outData = new DataFileWriter(gdWriter);
        outData = outData.create(schema, fs2.create(avroDataFile));
        try {
            in = new BufferedReader(new InputStreamReader(fs.open(p)));
            try {
                String str = in.readLine();
                while (str != null) {
                    GenericContainer gct = typeTree.parse(str);
                    if (gct != null) {
                        numGoodParses++;
                        outData.append(gct);
                    } else {
                        if (verbose) {
                            System.err.println("unparsed line: '" + str + "'");
                        }
                    }
                    str = in.readLine();
                    lineno++;
                }
            } finally {
                in.close();
            }
        } finally {
            outData.close();
        }
        if (verbose) {
            System.err.println();
            System.err.println("Total # input lines: " + lineno);
            System.err.println("Total # lines parsed correctly: " + numGoodParses);
        }
    }
}
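For context, a hypothetical invocation of inferRecordFormat, modeled on the computeSchema() call shown above; all paths here are placeholders.

// Hypothetical driver code; the input and output paths are placeholders.
FileSystem localFs = FileSystem.getLocal(new Configuration());
LearnStructure ls = new LearnStructure();
ls.inferRecordFormat(localFs, new Path("/tmp/input.log"), // text file to analyze
        localFs, new Path("/tmp/schema.avsc"), // where the inferred Avro schema is written
        new Path("/tmp/parser.dat"), // where the serialized parser is written
        null, null, // skip the JSON and Avro data dumps
        true, 1000); // verbose output, examine at most 1000 lines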
From source file:com.cloudera.recordservice.mapreduce.testapps.RecordCount.java
License:Apache License
public static long countRecords(String path) throws IOException {
    String output = TestUtil.getTempDirectory();
    Path inputPath = new Path(path);
    Path outputPath = new Path(output);

    JobConf conf = new JobConf(RecordCount.class);
    conf.setJobName("recordcount");
    conf.setOutputKeyClass(NullWritable.class);
    conf.setOutputValueClass(LongWritable.class);
    conf.setInt("mapreduce.job.reduces", 1);
    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);
    conf.setInputFormat(com.cloudera.recordservice.mapred.TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, outputPath);

    JobClient.runJob(conf);

    // Read the result and return it. Since we set the number of reducers to 1,
    // there is always just one file containing the value.
    FileSystem fs = outputPath.getFileSystem(conf);
    FSDataInputStream resultStream = fs.open(new Path(output + "/part-00000"));
    byte[] bytes = new byte[16];
    int length = resultStream.read(bytes);
    resultStream.close();
    String result = new String(bytes, 0, length).trim();
    return Long.parseLong(result);
}
From source file:com.cloudera.sa.ExcelRecordReader.java
License:Apache License
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
    FileSplit fileSplit = (FileSplit) split;
    Configuration conf = context.getConfiguration();
    Path file = fileSplit.getPath();
    FileSystem fs = file.getFileSystem(conf);
    this.in = fs.open(file);

    XSSFWorkbook workbook = new XSSFWorkbook(this.in);
    XSSFSheet sheet = workbook.getSheetAt(0);

    this.totalRows = sheet.getPhysicalNumberOfRows();
    this.processedRows = 0;
    this.rowIterator = sheet.rowIterator();
}
From source file:com.cloudera.science.avro.common.SchemaLoader.java
License:Open Source License
public Schema loadFromUrl(String schemaUrl) throws IOException {
    if (schemaUrl.toLowerCase().startsWith("hdfs://")) {
        FileSystem fs = FileSystem.get(conf);
        FSDataInputStream input = null;
        try {
            input = fs.open(new Path(schemaUrl));
            return parser.parse(input);
        } finally {
            if (input != null) {
                input.close();
            }
        }
    } else {
        InputStream is = null;
        try {
            is = new URL(schemaUrl).openStream();
            return parser.parse(is);
        } finally {
            if (is != null) {
                is.close();
            }
        }
    }
}
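A side note on the pattern: the explicit null-check-and-close above predates Java 7. A sketch of the same logic with try-with-resources, assuming the same parser and conf fields, reads:

public Schema loadFromUrl(String schemaUrl) throws IOException {
    if (schemaUrl.toLowerCase().startsWith("hdfs://")) {
        FileSystem fs = FileSystem.get(conf);
        // FSDataInputStream is Closeable, so try-with-resources closes it on every path
        try (FSDataInputStream input = fs.open(new Path(schemaUrl))) {
            return parser.parse(input);
        }
    }
    try (InputStream is = new URL(schemaUrl).openStream()) {
        return parser.parse(is);
    }
}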
From source file:com.cloudera.sqoop.TestAllTables.java
License:Apache License
public void testMultiTableImport() throws IOException {
    String[] argv = getArgv(true, null);
    runImport(new ImportAllTablesTool(), argv);

    Path warehousePath = new Path(this.getWarehouseDir());
    int i = 0;
    for (String tableName : this.tableNames) {
        Path tablePath = new Path(warehousePath, tableName);
        Path filePath = new Path(tablePath, "part-m-00000");

        // dequeue the expected value for this table. This
        // list has the same order as the tableNames list.
        String expectedVal = Integer.toString(i++) + "," + this.expectedStrings.get(0);
        this.expectedStrings.remove(0);

        BufferedReader reader = null;
        if (!isOnPhysicalCluster()) {
            reader = new BufferedReader(
                    new InputStreamReader(new FileInputStream(new File(filePath.toString()))));
        } else {
            FileSystem dfs = FileSystem.get(getConf());
            FSDataInputStream dis = dfs.open(filePath);
            reader = new BufferedReader(new InputStreamReader(dis));
        }
        try {
            String line = reader.readLine();
            assertEquals("Table " + tableName + " expected a different string", expectedVal, line);
        } finally {
            IOUtils.closeStream(reader);
        }
    }
}