Example usage for org.apache.hadoop.fs FileSystem open

List of usage examples for org.apache.hadoop.fs FileSystem open

Introduction

On this page you can find usage examples for the org.apache.hadoop.fs FileSystem open method.

Prototype

public FSDataInputStream open(PathHandle fd) throws IOException 

Document

Open an FSDataInputStream matching the PathHandle instance.
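
The usage examples below call the common open(Path) overload, which likewise returns an FSDataInputStream; newer Hadoop releases also expose the open(PathHandle) overload shown in the prototype. A minimal, self-contained sketch of reading a text file this way (the path used here is hypothetical):

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class FileSystemOpenExample {
    public static void main(String[] args) throws IOException {
        Path path = new Path("/tmp/example.txt"); // hypothetical file on the target FileSystem
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(path.toUri(), conf);

        // open(Path) returns an FSDataInputStream; try-with-resources closes it.
        try (FSDataInputStream in = fs.open(path);
                BufferedReader reader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                System.out.println(line);
            }
        }
    }
}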

Usage

From source file:com.cloudera.hive.scd.SQLUpdater.java

License:Open Source License

private List<String> readLines(FileSystem fs, Path path, long rootScdTime) throws IOException {
    List<String> lines = Lists.newArrayList();
    long currentScdTime = 0L;
    StringBuilder workingLine = null;
    for (String line : CharStreams.readLines(new InputStreamReader(fs.open(path)))) {
        if (line.toLowerCase(Locale.ENGLISH).startsWith(TIME_PREFIX)) {
            currentScdTime = asSCDTime(line.substring(TIME_PREFIX.length()), rootScdTime);
        } else if (currentScdTime <= rootScdTime) {
            // Prune out comments/whitespace
            line = line.trim();
            int commentIndex = line.indexOf("--");
            if (commentIndex >= 0) {
                line = line.substring(0, commentIndex);
            }
            if (!line.isEmpty()) {
                if (!line.endsWith(";")) {
                    if (workingLine == null) {
                        workingLine = new StringBuilder();
                    }
                    workingLine.append(line).append(' ');
                } else {
                    if (workingLine != null) {
                        workingLine.append(line);
                        lines.add(workingLine.toString());
                        workingLine = null; // working line is completed.
                    } else { // single-line statement
                        lines.add(line);
                    }
                }
            }
        }
    }
    if (workingLine != null) {
        throw new IllegalStateException("Incomplete SQL in updates: " + workingLine.toString());
    }
    return lines;
}

From source file:com.cloudera.hoop.client.fs.TestHoopFileSystem.java

License:Open Source License

private void testOpen() throws Exception {
    FileSystem fs = FileSystem.get(getHadoopConf());
    Path path = new Path(getHadoopTestDir(), "foo.txt");
    OutputStream os = fs.create(path);
    os.write(1);
    os.close();
    fs.close();
    Configuration conf = new Configuration();
    conf.set("fs.http.impl", HoopFileSystem.class.getName());
    fs = FileSystem.get(getJettyURL().toURI(), conf);
    InputStream is = fs.open(new Path(path.toUri().getPath()));
    Assert.assertEquals(is.read(), 1);
    is.close();
    fs.close();
}

From source file:com.cloudera.hoop.client.fs.TestHoopFileSystem.java

License:Open Source License

private void testCreate(Path path, boolean override) throws Exception {
    Configuration conf = new Configuration();
    conf.set("fs.http.impl", HoopFileSystem.class.getName());
    FileSystem fs = FileSystem.get(getJettyURL().toURI(), conf);
    FsPermission permission = new FsPermission(FsAction.READ_WRITE, FsAction.NONE, FsAction.NONE);
    OutputStream os = fs.create(new Path(path.toUri().getPath()), permission, override, 1024, (short) 2,
            100 * 1024 * 1024, null);
    os.write(1);
    os.close();
    fs.close();

    fs = FileSystem.get(getHadoopConf());
    FileStatus status = fs.getFileStatus(path);
    Assert.assertEquals(status.getReplication(), 2);
    Assert.assertEquals(status.getBlockSize(), 100 * 1024 * 1024);
    Assert.assertEquals(status.getPermission(), permission);
    InputStream is = fs.open(path);
    Assert.assertEquals(is.read(), 1);
    is.close();
    fs.close();
}

From source file:com.cloudera.hoop.client.fs.TestHoopFileSystem.java

License:Open Source License

private void testAppend() throws Exception {
    FileSystem fs = FileSystem.get(getHadoopConf());
    Path path = new Path(getHadoopTestDir(), "foo.txt");
    OutputStream os = fs.create(path);
    os.write(1);
    os.close();
    fs.close();
    Configuration conf = new Configuration();
    conf.set("fs.http.impl", HoopFileSystem.class.getName());
    fs = FileSystem.get(getJettyURL().toURI(), conf);
    os = fs.append(new Path(path.toUri().getPath()));
    os.write(2);
    os.close();
    fs.close();
    fs = FileSystem.get(getHadoopConf());
    InputStream is = fs.open(path);
    Assert.assertEquals(is.read(), 1);
    Assert.assertEquals(is.read(), 2);
    Assert.assertEquals(is.read(), -1);
    is.close();
    fs.close();
}

From source file:com.cloudera.impala.common.FileSystemUtil.java

License:Apache License

/**
 * Reads the file at path and returns the contents.
 */
public static String readFile(Path file) throws IOException {
    FileSystem fs = file.getFileSystem(CONF);
    InputStream fileStream = fs.open(file);
    try {
        return IOUtils.toString(fileStream);
    } finally {
        IOUtils.closeQuietly(fileStream);
    }
}

From source file:com.cloudera.oryx.app.pmml.AppPMMLUtils.java

License:Open Source License

public static PMML readPMMLFromUpdateKeyMessage(String key, String message, Configuration hadoopConf)
        throws IOException {
    String pmmlString;
    switch (key) {
    case "MODEL":
        pmmlString = message;
        break;
    case "MODEL-REF":
        // Allowing null is mostly for integration tests
        if (hadoopConf == null) {
            hadoopConf = new Configuration();
        }
        Path messagePath = new Path(message);
        FileSystem fs = FileSystem.get(messagePath.toUri(), hadoopConf);
        try (InputStreamReader in = new InputStreamReader(fs.open(messagePath), StandardCharsets.UTF_8)) {
            pmmlString = CharStreams.toString(in);
        } catch (FileNotFoundException fnfe) {
            log.warn("Unable to load model file at {}; ignoring", messagePath);
            return null;
        }
        break;
    default:
        throw new IllegalArgumentException("Unknown key " + key);
    }
    return PMMLUtils.fromString(pmmlString);
}

From source file:com.cloudera.oryx.ml.MLUpdate.java

License:Open Source License

@Override
public void runUpdate(JavaSparkContext sparkContext, long timestamp, JavaPairRDD<Object, M> newKeyMessageData,
        JavaPairRDD<Object, M> pastKeyMessageData, String modelDirString,
        TopicProducer<String, String> modelUpdateTopic) throws IOException, InterruptedException {

    Objects.requireNonNull(newKeyMessageData);

    JavaRDD<M> newData = newKeyMessageData.values();
    JavaRDD<M> pastData = pastKeyMessageData == null ? null : pastKeyMessageData.values();

    if (newData != null) {
        newData.cache();
        // This forces caching of the RDD. This shouldn't be necessary but we see some freezes
        // when many workers try to materialize the RDDs at once. Hence the workaround.
        newData.foreachPartition(p -> {
        });
    }
    if (pastData != null) {
        pastData.cache();
        pastData.foreachPartition(p -> {
        });
    }

    List<HyperParamValues<?>> hyperParamValues = getHyperParameterValues();
    int valuesPerHyperParam = HyperParams.chooseValuesPerHyperParam(hyperParamValues.size(), candidates);
    List<List<?>> hyperParameterCombos = HyperParams.chooseHyperParameterCombos(hyperParamValues, candidates,
            valuesPerHyperParam);

    Path modelDir = new Path(modelDirString);
    Path tempModelPath = new Path(modelDir, ".temporary");
    Path candidatesPath = new Path(tempModelPath, Long.toString(System.currentTimeMillis()));

    FileSystem fs = FileSystem.get(modelDir.toUri(), sparkContext.hadoopConfiguration());
    fs.mkdirs(candidatesPath);

    Path bestCandidatePath = findBestCandidatePath(sparkContext, newData, pastData, hyperParameterCombos,
            candidatesPath);

    Path finalPath = new Path(modelDir, Long.toString(System.currentTimeMillis()));
    if (bestCandidatePath == null) {
        log.info("Unable to build any model");
    } else {
        // Move best model into place
        fs.rename(bestCandidatePath, finalPath);
    }
    // Then delete everything else
    fs.delete(candidatesPath, true);

    if (modelUpdateTopic == null) {
        log.info("No update topic configured, not publishing models to a topic");
    } else {
        // Push PMML model onto update topic, if it exists
        Path bestModelPath = new Path(finalPath, MODEL_FILE_NAME);
        if (fs.exists(bestModelPath)) {
            FileStatus bestModelPathFS = fs.getFileStatus(bestModelPath);
            PMML bestModel = null;
            boolean modelNeededForUpdates = canPublishAdditionalModelData();
            boolean modelNotTooLarge = bestModelPathFS.getLen() <= maxMessageSize;
            if (modelNeededForUpdates || modelNotTooLarge) {
                // Either the model is required for publishAdditionalModelData, or required because it's going to
                // be serialized to Kafka
                try (InputStream in = fs.open(bestModelPath)) {
                    bestModel = PMMLUtils.read(in);
                }
            }

            if (modelNotTooLarge) {
                modelUpdateTopic.send("MODEL", PMMLUtils.toString(bestModel));
            } else {
                modelUpdateTopic.send("MODEL-REF", fs.makeQualified(bestModelPath).toString());
            }

            if (modelNeededForUpdates) {
                publishAdditionalModelData(sparkContext, bestModel, newData, pastData, finalPath,
                        modelUpdateTopic);
            }
        }
    }

    if (newData != null) {
        newData.unpersist();
    }
    if (pastData != null) {
        pastData.unpersist();
    }
}

From source file:com.cloudera.recordbreaker.analyzer.CSVDataDescriptor.java

License:Open Source License

/**
 * Test whether a given file is amenable to CSV processing
 */
public static boolean isCSV(FileSystem fs, Path p) {
    String fname = p.getName();
    if (fname.endsWith(".csv")) {
        return true;
    }
    CSVParser parser = new CSVParser();
    try {
        BufferedReader in = new BufferedReader(new InputStreamReader(fs.open(p)));
        try {
            int lineCount = 0;
            List<Integer> observedEltCounts = new ArrayList<Integer>();
            int totalEltCount = 0;
            int minEltCount = Integer.MAX_VALUE;
            int maxEltCount = -1;

            String line = null;
            while (lineCount < MAX_LINES && ((line = in.readLine()) != null)) {
                String[] parts = parser.parseLine(line);
                int numElts = parts.length;
                minEltCount = Math.min(minEltCount, numElts);
                maxEltCount = Math.max(maxEltCount, numElts);
                totalEltCount += numElts;
                observedEltCounts.add(numElts);

                lineCount++;
            }
            double meanEltCount = totalEltCount / (1.0 * observedEltCounts.size());
            double totalVariance = 0;
            for (Integer v : observedEltCounts) {
                totalVariance += Math.pow(v - meanEltCount, 2);
            }
            double variance = totalVariance / observedEltCounts.size();
            double stddev = Math.sqrt(variance);
            if (lineCount >= MIN_LINE_COUNT && meanEltCount >= MIN_MEAN_ELTS
                    && ((stddev / meanEltCount) < MAX_ALLOWABLE_LINE_STDDEV)) {
                return true;
            }
        } finally {
            in.close();
        }
    } catch (IOException ie) {
        // Not readable as CSV text; fall through and report false.
    }
    return false;
}

From source file:com.cloudera.recordbreaker.analyzer.CSVDataDescriptor.java

License:Open Source License

public void prepareAvroFile(FileSystem srcFs, FileSystem dstFs, Path dst, Configuration conf)
        throws IOException {
    // THIS IS WHERE THE MAGIC HAPPENS!!!
    // Convert CSV into Avro!!!!
    SchemaDescriptor sd = this.getSchemaDescriptor().get(0);
    List<Schema> unionFreeSchemas = SchemaUtils.getUnionFreeSchemasByFrequency(sd, 100, true);
    Schema schema = unionFreeSchemas.get(0);

    String headerRowHash = new String(sd.getPayload());
    CSVRowParser rowParser = new CSVRowParser(schema, headerRowHash);

    // Open stream to write out Avro contents
    DatumWriter<GenericRecord> writer = new GenericDatumWriter<GenericRecord>(schema);
    DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<GenericRecord>(writer);
    dataFileWriter.create(schema, dstFs.create(dst, true));
    int numRecords = 0;
    int MAX_RECORDS = 1000;
    try {
        BufferedReader in = new BufferedReader(new InputStreamReader(srcFs.open(getFilename())));
        try {
            String rowStr = null;
            while (((rowStr = in.readLine()) != null) && (numRecords < MAX_RECORDS)) {
                if (("" + rowStr.hashCode()).compareTo(headerRowHash) == 0) {
                    continue;
                }
                GenericData.Record record = rowParser.parseRow(rowStr);
                if (record == null) {
                    continue;
                }
                if (record.getSchema().toString().hashCode() != schema.toString().hashCode()) {
                    continue;
                }
                dataFileWriter.append(record);
                numRecords++;
            }
        } finally {
            in.close();
        }
    } finally {
        dataFileWriter.close();
    }
}

From source file:com.cloudera.recordbreaker.analyzer.TextRegexpDataDescriptor.java

License:Open Source License

public static boolean isTextRegexpFile(FileSystem fs, Path p, List<Pattern> regexps) throws IOException {
    int totalCounts = 0;
    int[] matchCounts = new int[regexps.size()];

    BufferedReader in = new BufferedReader(new InputStreamReader(fs.open(p)));
    try {
        String cur = null;
        while ((cur = in.readLine()) != null) {
            for (int i = 0; i < regexps.size(); i++) {
                Pattern patt = regexps.get(i);
                Matcher m = patt.matcher(cur);
                if (m.find()) {
                    matchCounts[i]++;
                }
            }
            totalCounts++;
            if (MAX_LINES >= 0 && totalCounts >= MAX_LINES) {
                break;
            }
        }
    } finally {
        in.close();
    }

    for (int i = 0; i < matchCounts.length; i++) {
        if (((1.0 * matchCounts[i]) / totalCounts) > 0.3) {
            return true;
        }
    }
    return false;
}