List of usage examples for org.apache.hadoop.fs.FileSystem#getContentSummary
public ContentSummary getContentSummary(Path f) throws IOException
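Before the project-specific examples below, a minimal sketch of the call itself may help (the path "/tmp/example-dir" and the standalone class are illustrative only, not taken from any of the sources that follow): getContentSummary walks the subtree rooted at the given path and returns a ContentSummary carrying the aggregate byte length, file count, and directory count.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ContentSummaryExample {
    public static void main(String[] args) throws IOException {
        // Illustrative path; replace with a real HDFS file or directory.
        Path path = new Path("/tmp/example-dir");
        Configuration conf = new Configuration();
        FileSystem fs = path.getFileSystem(conf);
        // Aggregates size and counts over the whole subtree rooted at 'path'.
        ContentSummary summary = fs.getContentSummary(path);
        System.out.println("length=" + summary.getLength()
            + " files=" + summary.getFileCount()
            + " dirs=" + summary.getDirectoryCount());
    }
}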
From source file: org.apache.falcon.regression.core.util.HiveAssert.java
License: Apache License

/**
 * Assertion for equality of two tables.
 * @param cluster1 the ColoHelper of first cluster
 * @param table1 the first table (expected values)
 * @param cluster2 the ColoHelper of second cluster
 * @param table2 the second table (actual values)
 * @param softAssert object used for performing assertion
 * @return object used for performing assertion
 * @throws java.io.IOException
 */
public static SoftAssert assertTableEqual(ColoHelper cluster1, HCatTable table1, ColoHelper cluster2,
        HCatTable table2, SoftAssert softAssert, boolean notIgnoreTblTypeAndProps) throws IOException {
    FileSystem cluster1FS = cluster1.getClusterHelper().getHadoopFS();
    FileSystem cluster2FS = cluster2.getClusterHelper().getHadoopFS();
    final String table1FullName = table1.getDbName() + "." + table1.getTableName();
    final String table2FullName = table2.getDbName() + "." + table2.getTableName();
    LOGGER.info("Checking equality of table : " + table1FullName + " & " + table2FullName);
    //table metadata equality
    softAssert.assertEquals(table2.comment(), table1.comment(),
        "Table " + table1FullName + " has different comment from " + table2FullName);
    softAssert.assertEquals(table2.getBucketCols(), table1.getBucketCols(),
        "Table " + table1FullName + " has different bucket columns from " + table2FullName);
    assertColumnListEqual(table1.getCols(), table2.getCols(), softAssert);
    softAssert.assertEquals(table2.getNumBuckets(), table1.getNumBuckets(),
        "Table " + table1FullName + " has different number of buckets from " + table2FullName);
    assertColumnListEqual(table1.getPartCols(), table2.getPartCols(), softAssert);
    softAssert.assertEquals(table2.getSerdeParams(), table1.getSerdeParams(),
        "Table " + table1FullName + " has different serde params from " + table2FullName);
    softAssert.assertEquals(table2.getSortCols(), table1.getSortCols(),
        "Table " + table1FullName + " has different sort columns from " + table2FullName);
    softAssert.assertEquals(table2.getStorageHandler(), table1.getStorageHandler(),
        "Table " + table1FullName + " has different storage handler from " + table2FullName);
    if (notIgnoreTblTypeAndProps) {
        softAssert.assertEquals(table2.getTabletype(), table1.getTabletype(),
            "Table " + table1FullName + " has different Tabletype from " + table2FullName);
    }
    final Map<String, String> tbl1Props = table1.getTblProps();
    final Map<String, String> tbl2Props = table2.getTblProps();
    final String[] ignoreTblProps = { "transient_lastDdlTime", "repl.last.id", "last_modified_by",
        "last_modified_time", "COLUMN_STATS_ACCURATE", };
    for (String ignoreTblProp : ignoreTblProps) {
        tbl1Props.remove(ignoreTblProp);
        tbl2Props.remove(ignoreTblProp);
    }
    final String[] ignoreDefaultProps = { "numRows", "rawDataSize" };
    for (String ignoreProp : ignoreDefaultProps) {
        if ("-1".equals(tbl1Props.get(ignoreProp))) {
            tbl1Props.remove(ignoreProp);
        }
        if ("-1".equals(tbl2Props.get(ignoreProp))) {
            tbl2Props.remove(ignoreProp);
        }
    }
    if (notIgnoreTblTypeAndProps) {
        softAssert.assertEquals(tbl2Props, tbl1Props,
            "Table " + table1FullName + " has different TblProps from " + table2FullName);
    }
    LOGGER.info("Checking equality of table partitions");
    HCatClient hcatClient1 = cluster1.getClusterHelper().getHCatClient();
    HCatClient hcatClient2 = cluster2.getClusterHelper().getHCatClient();
    final List<HCatPartition> table1Partitions =
        hcatClient1.getPartitions(table1.getDbName(), table1.getTableName());
    final List<HCatPartition> table2Partitions =
        hcatClient2.getPartitions(table2.getDbName(), table2.getTableName());
    assertPartitionListEqual(table1Partitions, table2Partitions, softAssert);
    if (notIgnoreTblTypeAndProps) {
        softAssert.assertEquals(cluster2FS.getContentSummary(new Path(table2.getLocation())).getLength(),
            cluster1FS.getContentSummary(new Path(table1.getLocation())).getLength(),
            "Size of content for table1 and table2 are different");
    }
    //table content equality
    LOGGER.info("Checking equality of table contents");
    Statement jdbcStmt1 = null, jdbcStmt2 = null;
    try {
        final boolean execute1;
        final boolean execute2;
        jdbcStmt1 = cluster1.getClusterHelper().getHiveJdbcConnection().createStatement();
        jdbcStmt2 = cluster2.getClusterHelper().getHiveJdbcConnection().createStatement();
        execute1 = jdbcStmt1.execute("select * from " + table1FullName);
        execute2 = jdbcStmt2.execute("select * from " + table2FullName);
        softAssert.assertEquals(execute2, execute1,
            "Table " + table1FullName + " has different result of select * from " + table2FullName);
        if (execute1 && execute2) {
            final ResultSet resultSet1 = jdbcStmt1.getResultSet();
            final ResultSet resultSet2 = jdbcStmt2.getResultSet();
            final List<String> rows1 = HiveUtil.fetchRows(resultSet1);
            final List<String> rows2 = HiveUtil.fetchRows(resultSet2);
            softAssert.assertEquals(rows2, rows1,
                "Table " + table1FullName + " has different content from " + table2FullName);
        }
    } catch (SQLException e) {
        softAssert.fail("Comparison of content of table " + table1FullName + " with content of table "
            + table2FullName + " failed because of exception\n" + ExceptionUtils.getFullStackTrace(e));
    } finally {
        if (jdbcStmt1 != null) {
            try {
                jdbcStmt1.close();
            } catch (SQLException e) {
                LOGGER.warn("Closing of jdbcStmt1 failed: " + ExceptionUtils.getFullStackTrace(e));
            }
        }
        if (jdbcStmt2 != null) {
            try {
                jdbcStmt2.close();
            } catch (SQLException e) {
                LOGGER.warn("Closing of jdbcStmt2 failed: " + ExceptionUtils.getFullStackTrace(e));
            }
        }
    }
    return softAssert;
}
From source file: org.apache.falcon.regression.ExternalFSTest.java
License: Apache License

@Test(dataProvider = "getData")
public void replicateToExternalFS(final FileSystem externalFS, final String separator, final boolean withData)
        throws Exception {
    final String endpoint = externalFS.getUri().toString();
    Bundle.submitCluster(bundles[0], externalBundle);
    String startTime = TimeUtil.getTimeWrtSystemTime(0);
    String endTime = TimeUtil.addMinsToTime(startTime, 5);
    LOGGER.info("Time range between : " + startTime + " and " + endTime);
    String datePattern = StringUtils
        .join(new String[] { "${YEAR}", "${MONTH}", "${DAY}", "${HOUR}", "${MINUTE}" }, separator);
    //configure feed
    FeedMerlin feed = new FeedMerlin(bundles[0].getDataSets().get(0));
    String targetDataLocation = endpoint + testWasbTargetDir + datePattern;
    feed.setFilePath(sourcePath + '/' + datePattern);
    //erase all clusters from feed definition
    feed.clearFeedClusters();
    //set local cluster as source
    feed.addFeedCluster(new FeedMerlin.FeedClusterBuilder(Util.readEntityName(bundles[0].getClusters().get(0)))
        .withRetention("days(1000000)", ActionType.DELETE).withValidity(startTime, endTime)
        .withClusterType(ClusterType.SOURCE).build());
    //set externalFS cluster as target
    feed.addFeedCluster(
        new FeedMerlin.FeedClusterBuilder(Util.readEntityName(externalBundle.getClusters().get(0)))
            .withRetention("days(1000000)", ActionType.DELETE).withValidity(startTime, endTime)
            .withClusterType(ClusterType.TARGET).withDataLocation(targetDataLocation).build());
    //submit and schedule feed
    LOGGER.info("Feed : " + Util.prettyPrintXml(feed.toString()));
    AssertUtil.assertSucceeded(prism.getFeedHelper().submitAndSchedule(feed.toString()));
    datePattern = StringUtils.join(new String[] { "yyyy", "MM", "dd", "HH", "mm" }, separator);
    //upload necessary data
    DateTime date = new DateTime(startTime, DateTimeZone.UTC);
    DateTimeFormatter fmt = DateTimeFormat.forPattern(datePattern);
    String timePattern = fmt.print(date);
    HadoopUtil.recreateDir(clusterFS, sourcePath + '/' + timePattern);
    if (withData) {
        HadoopUtil.copyDataToFolder(clusterFS, sourcePath + '/' + timePattern, OSUtil.SINGLE_FILE);
    }
    Path srcPath = new Path(sourcePath + '/' + timePattern);
    Path dstPath = new Path(endpoint + testWasbTargetDir + '/' + timePattern);
    //check if coordinator exists
    TimeUtil.sleepSeconds(10);
    InstanceUtil.waitTillInstancesAreCreated(clusterOC, feed.toString(), 0);
    Assert.assertEquals(OozieUtil.checkIfFeedCoordExist(clusterOC, feed.getName(), "REPLICATION"), 1);
    //replication should start, wait while it ends
    InstanceUtil.waitTillInstanceReachState(clusterOC, Util.readEntityName(feed.toString()), 1,
        CoordinatorAction.Status.SUCCEEDED, EntityType.FEED);
    //check if data has been replicated correctly
    List<Path> cluster1ReplicatedData = HadoopUtil.getAllFilesRecursivelyHDFS(clusterFS, srcPath);
    List<Path> cluster2ReplicatedData = HadoopUtil.getAllFilesRecursivelyHDFS(externalFS, dstPath);
    AssertUtil.checkForListSizes(cluster1ReplicatedData, cluster2ReplicatedData);
    final ContentSummary srcSummary = clusterFS.getContentSummary(srcPath);
    final ContentSummary dstSummary = externalFS.getContentSummary(dstPath);
    Assert.assertEquals(dstSummary.getLength(), srcSummary.getLength());
}
From source file: org.apache.falcon.workflow.LateDataHandler.java
License: Apache License

private long usage(Path inPath, Configuration conf) throws IOException, FalconException {
    FileSystem fs = HadoopClientFactory.get().createProxiedFileSystem(inPath.toUri(), conf);
    FileStatus[] fileStatuses = fs.globStatus(inPath);
    if (fileStatuses == null || fileStatuses.length == 0) {
        return 0;
    }
    long totalSize = 0;
    for (FileStatus fileStatus : fileStatuses) {
        totalSize += fs.getContentSummary(fileStatus.getPath()).getLength();
    }
    return totalSize;
}
From source file: org.apache.ignite.igfs.HadoopIgfs20FileSystemAbstractSelfTest.java
License: Apache License

/**
 * Test expected failures for 'get content summary' operation.
 *
 * @param fs File system to test.
 * @param path Path to evaluate content summary for.
 */
private void assertContentSummaryFails(final FileSystem fs, final Path path) {
    GridTestUtils.assertThrows(log, new Callable<ContentSummary>() {
        @Override
        public ContentSummary call() throws Exception {
            return fs.getContentSummary(path);
        }
    }, FileNotFoundException.class, null);
}
From source file: org.apache.ivory.latedata.LateDataHandler.java
License: Apache License

public long usage(Path inPath, Configuration conf) throws IOException {
    FileSystem fs = inPath.getFileSystem(conf);
    FileStatus[] status = fs.globStatus(inPath);
    if (status == null || status.length == 0) {
        return 0;
    }
    long totalSize = 0;
    for (FileStatus statu : status) {
        totalSize += fs.getContentSummary(statu.getPath()).getLength();
    }
    return totalSize;
}
From source file: org.apache.kylin.source.hive.CreateFlatHiveTableStep.java
License: Apache License

private long getFileSize(String hdfsUrl) throws IOException {
    Configuration configuration = new Configuration();
    Path path = new Path(hdfsUrl);
    FileSystem fs = path.getFileSystem(configuration);
    ContentSummary contentSummary = fs.getContentSummary(path);
    long length = contentSummary.getLength();
    return length;
}
From source file: org.apache.lens.server.query.ResultFormatter.java
License: Apache License

/**
 * Format output.
 *
 * @param ctx the query context
 */
private void formatOutput(QueryContext ctx) {
    QueryHandle queryHandle = ctx.getQueryHandle();
    this.logSegregationContext.setLogSegragationAndQueryId(ctx.getQueryHandleString());
    try {
        if (!ctx.isPersistent()) {
            log.info("No result formatting required for query " + queryHandle);
            return;
        }
        if (ctx.isResultAvailableInDriver()) {
            log.info("Result formatter for {}", queryHandle);
            LensResultSet resultSet = queryService.getDriverResultset(queryHandle);
            boolean isPersistedInDriver = resultSet instanceof PersistentResultSet;
            if (isPersistedInDriver) {
                // skip result formatting if persisted size is huge
                Path persistedDirectory = new Path(ctx.getDriverResultPath());
                FileSystem fs = persistedDirectory.getFileSystem(ctx.getConf());
                long size = fs.getContentSummary(persistedDirectory).getLength();
                long threshold = ctx.getConf().getLong(LensConfConstants.RESULT_FORMAT_SIZE_THRESHOLD,
                    LensConfConstants.DEFAULT_RESULT_FORMAT_SIZE_THRESHOLD);
                log.info(" size :{} threshold:{}", size, threshold);
                if (size > threshold) {
                    log.warn("Persisted result size more than the threshold, size:{} and threshold:{}; Skipping formatter",
                        size, threshold);
                    queryService.setSuccessState(ctx);
                    return;
                }
            }
            // now do the formatting
            createAndSetFormatter(ctx, isPersistedInDriver);
            QueryOutputFormatter formatter = ctx.getQueryOutputFormatter();
            try {
                formatter.init(ctx, resultSet.getMetadata());
                if (ctx.getConf().getBoolean(LensConfConstants.QUERY_OUTPUT_WRITE_HEADER,
                        LensConfConstants.DEFAULT_OUTPUT_WRITE_HEADER)) {
                    formatter.writeHeader();
                }
                if (isPersistedInDriver) {
                    log.info("Result formatter for {} in persistent result", queryHandle);
                    Path persistedDirectory = new Path(ctx.getDriverResultPath());
                    // write all files from persistent directory
                    ((PersistedOutputFormatter) formatter).addRowsFromPersistedPath(persistedDirectory);
                } else {
                    log.info("Result formatter for {} in inmemory result", queryHandle);
                    InMemoryResultSet inmemory = (InMemoryResultSet) resultSet;
                    while (inmemory.hasNext()) {
                        ((InMemoryOutputFormatter) formatter).writeRow(inmemory.next());
                    }
                    inmemory.setFullyAccessed(true);
                }
                if (ctx.getConf().getBoolean(LensConfConstants.QUERY_OUTPUT_WRITE_FOOTER,
                        LensConfConstants.DEFAULT_OUTPUT_WRITE_FOOTER)) {
                    formatter.writeFooter();
                }
                formatter.commit();
            } finally {
                formatter.close();
            }
            queryService.setSuccessState(ctx);
            log.info("Result formatter has completed. Final path:{}", formatter.getFinalOutputPath());
        }
    } catch (Exception e) {
        MetricsService metricsService = LensServices.get().getService(MetricsService.NAME);
        metricsService.incrCounter(ResultFormatter.class, "formatting-errors");
        log.warn("Exception while formatting result for {}", queryHandle, e);
        try {
            // set output formatter to null so that server restart is faster in case this query is not purged.
            ctx.setQueryOutputFormatter(null);
            queryService.setFailedStatus(ctx, ERROR_MESSAGE, e);
        } catch (LensException e1) {
            log.error("Exception while setting failure for {}", queryHandle, e1);
        }
    }
}
From source file: org.apache.metamodel.util.HdfsResource.java
License: Apache License

@Override
public long getSize() {
    final FileSystem fs = getHadoopFileSystem();
    try {
        if (fs.isFile(getHadoopPath())) {
            return fs.getFileStatus(getHadoopPath()).getLen();
        } else {
            return fs.getContentSummary(getHadoopPath()).getLength();
        }
    } catch (Exception e) {
        throw wrapException(e);
    } finally {
        FileHelper.safeClose(fs);
    }
}
From source file: org.apache.orc.tools.JsonFileDump.java
License: Apache License

public static void printJsonMetaData(List<String> files, Configuration conf, List<Integer> rowIndexCols,
        boolean prettyPrint, boolean printTimeZone) throws JSONException, IOException {
    if (files.isEmpty()) {
        return;
    }
    JSONStringer writer = new JSONStringer();
    boolean multiFile = files.size() > 1;
    if (multiFile) {
        writer.array();
    } else {
        writer.object();
    }
    for (String filename : files) {
        try {
            if (multiFile) {
                writer.object();
            }
            writer.key("fileName").value(filename);
            Path path = new Path(filename);
            Reader reader = FileDump.getReader(path, conf, null);
            if (reader == null) {
                writer.key("status").value("FAILED");
                continue;
            }
            writer.key("fileVersion").value(reader.getFileVersion().getName());
            writer.key("writerVersion").value(reader.getWriterVersion());
            RecordReaderImpl rows = (RecordReaderImpl) reader.rows();
            writer.key("numberOfRows").value(reader.getNumberOfRows());
            writer.key("compression").value(reader.getCompressionKind());
            if (reader.getCompressionKind() != CompressionKind.NONE) {
                writer.key("compressionBufferSize").value(reader.getCompressionSize());
            }
            writer.key("schemaString").value(reader.getSchema().toString());
            writer.key("schema").array();
            writeSchema(writer, reader.getTypes());
            writer.endArray();
            writer.key("stripeStatistics").array();
            List<StripeStatistics> stripeStatistics = reader.getStripeStatistics();
            for (int n = 0; n < stripeStatistics.size(); n++) {
                writer.object();
                writer.key("stripeNumber").value(n + 1);
                StripeStatistics ss = stripeStatistics.get(n);
                writer.key("columnStatistics").array();
                for (int i = 0; i < ss.getColumnStatistics().length; i++) {
                    writer.object();
                    writer.key("columnId").value(i);
                    writeColumnStatistics(writer, ss.getColumnStatistics()[i]);
                    writer.endObject();
                }
                writer.endArray();
                writer.endObject();
            }
            writer.endArray();
            ColumnStatistics[] stats = reader.getStatistics();
            int colCount = stats.length;
            if (rowIndexCols == null) {
                rowIndexCols = new ArrayList<>(colCount);
                for (int i = 0; i < colCount; ++i) {
                    rowIndexCols.add(i);
                }
            }
            writer.key("fileStatistics").array();
            for (int i = 0; i < stats.length; ++i) {
                writer.object();
                writer.key("columnId").value(i);
                writeColumnStatistics(writer, stats[i]);
                writer.endObject();
            }
            writer.endArray();
            writer.key("stripes").array();
            int stripeIx = -1;
            for (StripeInformation stripe : reader.getStripes()) {
                ++stripeIx;
                long stripeStart = stripe.getOffset();
                OrcProto.StripeFooter footer = rows.readStripeFooter(stripe);
                writer.object(); // start of stripe information
                writer.key("stripeNumber").value(stripeIx + 1);
                writer.key("stripeInformation");
                writeStripeInformation(writer, stripe);
                if (printTimeZone) {
                    writer.key("writerTimezone")
                        .value(footer.hasWriterTimezone() ? footer.getWriterTimezone() : FileDump.UNKNOWN);
                }
                long sectionStart = stripeStart;
                writer.key("streams").array();
                for (OrcProto.Stream section : footer.getStreamsList()) {
                    writer.object();
                    String kind = section.hasKind() ? section.getKind().name() : FileDump.UNKNOWN;
                    writer.key("columnId").value(section.getColumn());
                    writer.key("section").value(kind);
                    writer.key("startOffset").value(sectionStart);
                    writer.key("length").value(section.getLength());
                    sectionStart += section.getLength();
                    writer.endObject();
                }
                writer.endArray();
                writer.key("encodings").array();
                for (int i = 0; i < footer.getColumnsCount(); ++i) {
                    writer.object();
                    OrcProto.ColumnEncoding encoding = footer.getColumns(i);
                    writer.key("columnId").value(i);
                    writer.key("kind").value(encoding.getKind());
                    if (encoding.getKind() == OrcProto.ColumnEncoding.Kind.DICTIONARY
                            || encoding.getKind() == OrcProto.ColumnEncoding.Kind.DICTIONARY_V2) {
                        writer.key("dictionarySize").value(encoding.getDictionarySize());
                    }
                    writer.endObject();
                }
                writer.endArray();
                if (!rowIndexCols.isEmpty()) {
                    // include the columns that are specified, only if the columns are included, bloom filter
                    // will be read
                    boolean[] sargColumns = new boolean[colCount];
                    for (int colIdx : rowIndexCols) {
                        sargColumns[colIdx] = true;
                    }
                    OrcIndex indices = rows.readRowIndex(stripeIx, null, sargColumns);
                    writer.key("indexes").array();
                    for (int col : rowIndexCols) {
                        writer.object();
                        writer.key("columnId").value(col);
                        writeRowGroupIndexes(writer, col, indices.getRowGroupIndex());
                        writeBloomFilterIndexes(writer, col, indices.getBloomFilterIndex());
                        writer.endObject();
                    }
                    writer.endArray();
                }
                writer.endObject(); // end of stripe information
            }
            writer.endArray();
            FileSystem fs = path.getFileSystem(conf);
            long fileLen = fs.getContentSummary(path).getLength();
            long paddedBytes = FileDump.getTotalPaddingSize(reader);
            // empty ORC file is ~45 bytes. Assumption here is file length always >0
            double percentPadding = ((double) paddedBytes / (double) fileLen) * 100;
            writer.key("fileLength").value(fileLen);
            writer.key("paddingLength").value(paddedBytes);
            writer.key("paddingRatio").value(percentPadding);
            AcidStats acidStats = OrcAcidUtils.parseAcidStats(reader);
            if (acidStats != null) {
                writer.key("numInserts").value(acidStats.inserts);
                writer.key("numDeletes").value(acidStats.deletes);
                writer.key("numUpdates").value(acidStats.updates);
            }
            writer.key("status").value("OK");
            rows.close();
            writer.endObject();
        } catch (Exception e) {
            writer.key("status").value("FAILED");
            throw e;
        }
    }
    if (multiFile) {
        writer.endArray();
    }
    if (prettyPrint) {
        final String prettyJson;
        if (multiFile) {
            JSONArray jsonArray = new JSONArray(writer.toString());
            prettyJson = jsonArray.toString(2);
        } else {
            JSONObject jsonObject = new JSONObject(writer.toString());
            prettyJson = jsonObject.toString(2);
        }
        System.out.println(prettyJson);
    } else {
        System.out.println(writer.toString());
    }
}
From source file: org.apache.sysml.runtime.controlprogram.parfor.opt.OptimizerRuleBased.java
License: Apache License

/**
 * Increasing the partition replication factor is beneficial if partitions are
 * read multiple times (e.g., in nested loops) because partitioning (done once)
 * gets slightly slower but there is a higher probability for local access.
 *
 * NOTE: this rewrite requires 'set data partitioner' to be executed in order to
 * leverage the partitioning information in the plan tree.
 *
 * @param n internal representation of a plan alternative for program blocks and instructions
 * @param partitionedMatrices map of data partition formats
 * @param vars local variable map
 * @throws DMLRuntimeException if DMLRuntimeException occurs
 */
protected void rewriteSetPartitionReplicationFactor(OptNode n,
        HashMap<String, PartitionFormat> partitionedMatrices, LocalVariableMap vars) throws DMLRuntimeException {
    boolean apply = false;
    double sizeReplicated = 0;
    int replication = ParForProgramBlock.WRITE_REPLICATION_FACTOR;
    ParForProgramBlock pfpb = (ParForProgramBlock) OptTreeConverter.getAbstractPlanMapping()
        .getMappedProg(n.getID())[1];
    if (((n.getExecType() == ExecType.MR
            && n.getParam(ParamType.DATA_PARTITIONER).equals(PDataPartitioner.REMOTE_MR.name()))
        || (n.getExecType() == ExecType.SPARK
            && n.getParam(ParamType.DATA_PARTITIONER).equals(PDataPartitioner.REMOTE_SPARK.name())))
        && n.hasNestedParallelism(false) && n.hasNestedPartitionReads(false)) {
        apply = true;
        //account for problem and cluster constraints
        replication = (int) Math.min(_N, _rnk);
        //account for internal max constraint (note hadoop will warn if max > 10)
        replication = (int) Math.min(replication, MAX_REPLICATION_FACTOR_PARTITIONING);
        //account for remaining hdfs capacity
        try {
            FileSystem fs = FileSystem.get(ConfigurationManager.getCachedJobConf());
            long hdfsCapacityRemain = fs.getStatus().getRemaining();
            long sizeInputs = 0; //sum of all input sizes (w/o replication)
            for (String var : partitionedMatrices.keySet()) {
                MatrixObject mo = (MatrixObject) vars.get(var);
                Path fname = new Path(mo.getFileName());
                if (fs.exists(fname)) //non-existing (e.g., CP) -> small file
                    sizeInputs += fs.getContentSummary(fname).getLength();
            }
            replication = (int) Math.min(replication, Math.floor(0.9 * hdfsCapacityRemain / sizeInputs));
            //ensure at least replication 1
            replication = Math.max(replication, ParForProgramBlock.WRITE_REPLICATION_FACTOR);
            sizeReplicated = replication * sizeInputs;
        } catch (Exception ex) {
            throw new DMLRuntimeException("Failed to analyze remaining hdfs capacity.", ex);
        }
    }
    //modify the runtime plan
    if (apply)
        pfpb.setPartitionReplicationFactor(replication);
    _numEvaluatedPlans++;
    LOG.debug(getOptMode() + " OPT: rewrite 'set partition replication factor' - result=" + apply
        + ((apply) ? " (" + replication + ", " + toMB(sizeReplicated) + ")" : ""));
}