List of usage examples for org.apache.hadoop.fs FileSystem open
public FSDataInputStream open(Path f) throws IOException
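Before the sourced examples below, here is a minimal, self-contained sketch of the typical call pattern. It assumes a default Configuration and a hypothetical input path (/tmp/example.txt); open(Path) returns an FSDataInputStream, which is closed here via try-with-resources:

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class OpenExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path path = new Path("/tmp/example.txt"); // hypothetical path
        FileSystem fs = path.getFileSystem(conf);
        // open() returns an FSDataInputStream; close it when done
        try (FSDataInputStream in = fs.open(path);
                BufferedReader reader = new BufferedReader(
                        new InputStreamReader(in, StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                System.out.println(line);
            }
        }
    }
}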
From source file:com.cloudera.hive.scd.SQLUpdater.java
License:Open Source License
private List<String> readLines(FileSystem fs, Path path, long rootScdTime) throws IOException {
    List<String> lines = Lists.newArrayList();
    long currentScdTime = 0L;
    StringBuilder workingLine = null;
    for (String line : CharStreams.readLines(new InputStreamReader(fs.open(path)))) {
        if (line.toLowerCase(Locale.ENGLISH).startsWith(TIME_PREFIX)) {
            currentScdTime = asSCDTime(line.substring(TIME_PREFIX.length()), rootScdTime);
        } else if (currentScdTime <= rootScdTime) {
            // Prune out comments/whitespace
            line = line.trim();
            int commentIndex = line.indexOf("--");
            if (commentIndex >= 0) {
                line = line.substring(0, commentIndex);
            }
            if (!line.isEmpty()) {
                if (!line.endsWith(";")) {
                    if (workingLine == null) {
                        workingLine = new StringBuilder();
                    }
                    workingLine.append(line).append(' ');
                } else {
                    if (workingLine != null) {
                        workingLine.append(line);
                        lines.add(workingLine.toString());
                        workingLine = null; // working line is completed.
                    } else {
                        // single-line statement
                        lines.add(line);
                    }
                }
            }
        }
    }
    if (workingLine != null) {
        throw new IllegalStateException("Incomplete SQL in updates: " + workingLine.toString());
    }
    return lines;
}
From source file:com.cloudera.hoop.client.fs.TestHoopFileSystem.java
License:Open Source License
private void testOpen() throws Exception {
    FileSystem fs = FileSystem.get(getHadoopConf());
    Path path = new Path(getHadoopTestDir(), "foo.txt");
    OutputStream os = fs.create(path);
    os.write(1);
    os.close();
    fs.close();
    Configuration conf = new Configuration();
    conf.set("fs.http.impl", HoopFileSystem.class.getName());
    fs = FileSystem.get(getJettyURL().toURI(), conf);
    InputStream is = fs.open(new Path(path.toUri().getPath()));
    Assert.assertEquals(is.read(), 1);
    is.close();
    fs.close();
}
From source file:com.cloudera.hoop.client.fs.TestHoopFileSystem.java
License:Open Source License
private void testCreate(Path path, boolean override) throws Exception {
    Configuration conf = new Configuration();
    conf.set("fs.http.impl", HoopFileSystem.class.getName());
    FileSystem fs = FileSystem.get(getJettyURL().toURI(), conf);
    FsPermission permission = new FsPermission(FsAction.READ_WRITE, FsAction.NONE, FsAction.NONE);
    OutputStream os = fs.create(new Path(path.toUri().getPath()), permission, override, 1024, (short) 2,
            100 * 1024 * 1024, null);
    os.write(1);
    os.close();
    fs.close();
    fs = FileSystem.get(getHadoopConf());
    FileStatus status = fs.getFileStatus(path);
    Assert.assertEquals(status.getReplication(), 2);
    Assert.assertEquals(status.getBlockSize(), 100 * 1024 * 1024);
    Assert.assertEquals(status.getPermission(), permission);
    InputStream is = fs.open(path);
    Assert.assertEquals(is.read(), 1);
    is.close();
    fs.close();
}
From source file:com.cloudera.hoop.client.fs.TestHoopFileSystem.java
License:Open Source License
private void testAppend() throws Exception {
    FileSystem fs = FileSystem.get(getHadoopConf());
    Path path = new Path(getHadoopTestDir(), "foo.txt");
    OutputStream os = fs.create(path);
    os.write(1);
    os.close();
    fs.close();
    Configuration conf = new Configuration();
    conf.set("fs.http.impl", HoopFileSystem.class.getName());
    fs = FileSystem.get(getJettyURL().toURI(), conf);
    os = fs.append(new Path(path.toUri().getPath()));
    os.write(2);
    os.close();
    fs.close();
    fs = FileSystem.get(getHadoopConf());
    InputStream is = fs.open(path);
    Assert.assertEquals(is.read(), 1);
    Assert.assertEquals(is.read(), 2);
    Assert.assertEquals(is.read(), -1);
    is.close();
    fs.close();
}
From source file:com.cloudera.impala.common.FileSystemUtil.java
License:Apache License
/**
 * Reads the file at path and returns the contents.
 */
public static String readFile(Path file) throws IOException {
    FileSystem fs = file.getFileSystem(CONF);
    InputStream fileStream = fs.open(file);
    try {
        return IOUtils.toString(fileStream);
    } finally {
        IOUtils.closeQuietly(fileStream);
    }
}
From source file:com.cloudera.oryx.app.pmml.AppPMMLUtils.java
License:Open Source License
public static PMML readPMMLFromUpdateKeyMessage(String key, String message, Configuration hadoopConf)
        throws IOException {
    String pmmlString;
    switch (key) {
    case "MODEL":
        pmmlString = message;
        break;
    case "MODEL-REF":
        // Allowing null is mostly for integration tests
        if (hadoopConf == null) {
            hadoopConf = new Configuration();
        }
        Path messagePath = new Path(message);
        FileSystem fs = FileSystem.get(messagePath.toUri(), hadoopConf);
        try (InputStreamReader in = new InputStreamReader(fs.open(messagePath), StandardCharsets.UTF_8)) {
            pmmlString = CharStreams.toString(in);
        } catch (FileNotFoundException fnfe) {
            log.warn("Unable to load model file at {}; ignoring", messagePath);
            return null;
        }
        break;
    default:
        throw new IllegalArgumentException("Unknown key " + key);
    }
    return PMMLUtils.fromString(pmmlString);
}
From source file:com.cloudera.oryx.ml.MLUpdate.java
License:Open Source License
@Override
public void runUpdate(JavaSparkContext sparkContext, long timestamp, JavaPairRDD<Object, M> newKeyMessageData,
        JavaPairRDD<Object, M> pastKeyMessageData, String modelDirString,
        TopicProducer<String, String> modelUpdateTopic) throws IOException, InterruptedException {

    Objects.requireNonNull(newKeyMessageData);

    JavaRDD<M> newData = newKeyMessageData.values();
    JavaRDD<M> pastData = pastKeyMessageData == null ? null : pastKeyMessageData.values();

    if (newData != null) {
        newData.cache();
        // This forces caching of the RDD. This shouldn't be necessary but we see some freezes
        // when many workers try to materialize the RDDs at once. Hence the workaround.
        newData.foreachPartition(p -> {
        });
    }
    if (pastData != null) {
        pastData.cache();
        pastData.foreachPartition(p -> {
        });
    }

    List<HyperParamValues<?>> hyperParamValues = getHyperParameterValues();
    int valuesPerHyperParam = HyperParams.chooseValuesPerHyperParam(hyperParamValues.size(), candidates);
    List<List<?>> hyperParameterCombos = HyperParams.chooseHyperParameterCombos(hyperParamValues, candidates,
            valuesPerHyperParam);

    Path modelDir = new Path(modelDirString);
    Path tempModelPath = new Path(modelDir, ".temporary");
    Path candidatesPath = new Path(tempModelPath, Long.toString(System.currentTimeMillis()));

    FileSystem fs = FileSystem.get(modelDir.toUri(), sparkContext.hadoopConfiguration());
    fs.mkdirs(candidatesPath);

    Path bestCandidatePath = findBestCandidatePath(sparkContext, newData, pastData, hyperParameterCombos,
            candidatesPath);

    Path finalPath = new Path(modelDir, Long.toString(System.currentTimeMillis()));
    if (bestCandidatePath == null) {
        log.info("Unable to build any model");
    } else {
        // Move best model into place
        fs.rename(bestCandidatePath, finalPath);
    }
    // Then delete everything else
    fs.delete(candidatesPath, true);

    if (modelUpdateTopic == null) {
        log.info("No update topic configured, not publishing models to a topic");
    } else {
        // Push PMML model onto update topic, if it exists
        Path bestModelPath = new Path(finalPath, MODEL_FILE_NAME);
        if (fs.exists(bestModelPath)) {
            FileStatus bestModelPathFS = fs.getFileStatus(bestModelPath);
            PMML bestModel = null;
            boolean modelNeededForUpdates = canPublishAdditionalModelData();
            boolean modelNotTooLarge = bestModelPathFS.getLen() <= maxMessageSize;
            if (modelNeededForUpdates || modelNotTooLarge) {
                // Either the model is required for publishAdditionalModelData, or required because it's going to
                // be serialized to Kafka
                try (InputStream in = fs.open(bestModelPath)) {
                    bestModel = PMMLUtils.read(in);
                }
            }
            if (modelNotTooLarge) {
                modelUpdateTopic.send("MODEL", PMMLUtils.toString(bestModel));
            } else {
                modelUpdateTopic.send("MODEL-REF", fs.makeQualified(bestModelPath).toString());
            }
            if (modelNeededForUpdates) {
                publishAdditionalModelData(sparkContext, bestModel, newData, pastData, finalPath,
                        modelUpdateTopic);
            }
        }
    }

    if (newData != null) {
        newData.unpersist();
    }
    if (pastData != null) {
        pastData.unpersist();
    }
}
From source file:com.cloudera.recordbreaker.analyzer.CSVDataDescriptor.java
License:Open Source License
/**
 * Test whether a given file is amenable to CSV processing
 */
public static boolean isCSV(FileSystem fs, Path p) {
    String fname = p.getName();
    if (fname.endsWith(".csv")) {
        return true;
    }
    CSVParser parser = new CSVParser();
    try {
        BufferedReader in = new BufferedReader(new InputStreamReader(fs.open(p)));
        try {
            int lineCount = 0;
            List<Integer> observedEltCounts = new ArrayList<Integer>();
            int totalEltCount = 0;
            int minEltCount = Integer.MAX_VALUE;
            int maxEltCount = -1;
            String line = null;
            while (lineCount < MAX_LINES && ((line = in.readLine()) != null)) {
                String[] parts = parser.parseLine(line);
                int numElts = parts.length;
                minEltCount = Math.min(minEltCount, numElts);
                maxEltCount = Math.max(maxEltCount, numElts);
                totalEltCount += numElts;
                observedEltCounts.add(numElts);
                lineCount++;
            }
            double meanEltCount = totalEltCount / (1.0 * observedEltCounts.size());
            double totalVariance = 0;
            for (Integer v : observedEltCounts) {
                totalVariance += Math.pow(v - meanEltCount, 2);
            }
            double variance = totalVariance / observedEltCounts.size();
            double stddev = Math.sqrt(variance);
            if (lineCount >= MIN_LINE_COUNT && meanEltCount >= MIN_MEAN_ELTS
                    && ((stddev / meanEltCount) < MAX_ALLOWABLE_LINE_STDDEV)) {
                return true;
            }
        } finally {
            in.close();
        }
    } catch (IOException ie) {
    }
    return false;
}
From source file:com.cloudera.recordbreaker.analyzer.CSVDataDescriptor.java
License:Open Source License
public void prepareAvroFile(FileSystem srcFs, FileSystem dstFs, Path dst, Configuration conf) throws IOException {
    // THIS IS WHERE THE MAGIC HAPPENS!!!
    // Convert CSV into Avro!!!!
    SchemaDescriptor sd = this.getSchemaDescriptor().get(0);
    List<Schema> unionFreeSchemas = SchemaUtils.getUnionFreeSchemasByFrequency(sd, 100, true);
    Schema schema = unionFreeSchemas.get(0);
    String headerRowHash = new String(sd.getPayload());
    CSVRowParser rowParser = new CSVRowParser(schema, headerRowHash);

    // Open stream to write out Avro contents
    DatumWriter<GenericRecord> writer = new GenericDatumWriter<GenericRecord>(schema);
    DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<GenericRecord>(writer);
    dataFileWriter.create(schema, dstFs.create(dst, true));
    int numRecords = 0;
    int MAX_RECORDS = 1000;
    try {
        BufferedReader in = new BufferedReader(new InputStreamReader(srcFs.open(getFilename())));
        try {
            String rowStr = null;
            while (((rowStr = in.readLine()) != null) && (numRecords < MAX_RECORDS)) {
                if (("" + rowStr.hashCode()).compareTo(headerRowHash) == 0) {
                    continue;
                }
                GenericData.Record record = rowParser.parseRow(rowStr);
                if (record == null) {
                    continue;
                }
                if (record.getSchema().toString().hashCode() != schema.toString().hashCode()) {
                    continue;
                }
                dataFileWriter.append(record);
                numRecords++;
            }
        } finally {
            in.close();
        }
    } finally {
        dataFileWriter.close();
    }
}
From source file:com.cloudera.recordbreaker.analyzer.TextRegexpDataDescriptor.java
License:Open Source License
public static boolean isTextRegexpFile(FileSystem fs, Path p, List<Pattern> regexps) throws IOException {
    int totalCounts = 0;
    int[] matchCounts = new int[regexps.size()];
    BufferedReader in = new BufferedReader(new InputStreamReader(fs.open(p)));
    try {
        String cur = null;
        while ((cur = in.readLine()) != null) {
            for (int i = 0; i < regexps.size(); i++) {
                Pattern patt = regexps.get(i);
                Matcher m = patt.matcher(cur);
                if (m.find()) {
                    matchCounts[i]++;
                }
            }
            totalCounts++;
            if (MAX_LINES >= 0 && totalCounts >= MAX_LINES) {
                break;
            }
        }
    } finally {
        in.close();
    }
    for (int i = 0; i < matchCounts.length; i++) {
        if (((1.0 * matchCounts[i]) / totalCounts) > 0.3) {
            return true;
        }
    }
    return false;
}