List of usage examples for org.apache.commons.io.FileUtils.ONE_GB
long ONE_GB: the number of bytes in one gigabyte (1024 * 1024 * 1024 = 1,073,741,824).
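The examples below use ONE_GB in two ways: dividing a raw byte count by it to report gigabytes, and multiplying it to build size thresholds. As a minimal, self-contained sketch of both patterns (the class name OneGbExample and the sample byte count are ours for illustration, not from any of the projects below):

import org.apache.commons.io.FileUtils;

public class OneGbExample {
    public static void main(String[] args) {
        long totalBytes = 5L * FileUtils.ONE_GB; // ONE_GB == 1,073,741,824 bytes

        // reporting: convert a raw byte count to gigabytes
        System.out.println("Total processed GBytes = " + ((double) totalBytes / FileUtils.ONE_GB)); // 5.0

        // thresholds: compare a byte count against a multiple of ONE_GB
        if (totalBytes > 2 * FileUtils.ONE_GB) {
            System.out.println("more than 2 GB, load via cloud storage instead of streaming");
        }
    }
}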
From source file:io.ecarf.core.cloud.task.processor.reason.phase2.DoReasonTask9.java
@Override
public void run() throws IOException {
    GoogleCloudService cloud = (GoogleCloudService) this.getCloudService();
    Stopwatch stopwatch1 = Stopwatch.createUnstarted();
    Stopwatch stopwatch2 = Stopwatch.createUnstarted();

    this.setup(cloud);

    String decoratedTable = table;
    int emptyRetries = 0;
    int totalInferredTriples = 0;

    int maxRetries;
    if (this.retries == null) {
        maxRetries = Config.getIntegerProperty(Constants.REASON_RETRY_KEY, 6);
    } else {
        maxRetries = this.retries;
    }

    int cycleSleep;
    if (this.sleep == null) {
        cycleSleep = Config.getIntegerProperty(Constants.REASON_SLEEP_KEY, 20);
    } else {
        cycleSleep = this.sleep;
    }

    this.ddLimit = Config.getIntegerProperty(Constants.REASON_DATA_DIRECT_DOWNLOAD_LIMIT, 1_200_000);
    int streamingThreshold = Config.getIntegerProperty("ecarf.io.reasoning.streaming.threshold", 100000);

    String instanceId = cloud.getInstanceId();

    int processors = Runtime.getRuntime().availableProcessors();
    if (processors > 1) {
        this.executor = Utils.createFixedThreadPool(processors);
    }

    int count = 0;
    QueryGenerator<Long> generator = new QueryGenerator<Long>(schemaTerms, null);

    // timestamp loop
    do {
        // First of all run all the queries asynchronously and remember the jobId and filename for each term
        generator.setDecoratedTable(decoratedTable);
        String query = generator.getQuery();
        log.debug("Generated Query: " + query);

        String queryResultFilePrefix = instanceId + "_QueryResults_" + count;
        String jobId = cloud.startBigDataQuery(query, new BigDataTable(this.table));
        //QueryResult queryResult = QueryResult.create().setFilename(queryResultFilePrefix).setJobId(jobId);

        long start = System.currentTimeMillis();

        // block and wait for each job to complete then save results to a file
        QueryStats stats = cloud.saveBigQueryResultsToFile(jobId, queryResultFilePrefix, this.bucket,
                processors, this.ddLimit);

        BigInteger rows = stats.getTotalRows();
        this.totalBytes = this.totalBytes + stats.getTotalProcessedBytes();

        Set<Long> productiveTerms = new HashSet<>();
        Set<String> inferredTriplesFiles = new HashSet<>();
        int interimInferredTriples = 0;

        // only process if triples are found matching this term
        if ((rows != null) && !BigInteger.ZERO.equals(rows)) {
            stopwatch1.start();
            interimInferredTriples = this.inferAndSaveTriplesToFile(stats, productiveTerms, processors,
                    inferredTriplesFiles);
            this.totalRows = this.totalRows.add(rows);
            stopwatch1.stop();
        } else {
            log.info("Skipping query as no data is found");
        }

        totalInferredTriples += interimInferredTriples;

        if (interimInferredTriples > 0) {
            // stream smaller numbers of inferred triples
            // try uploading from cloud storage
            log.info("Inserting " + interimInferredTriples + ", inferred triples into Big Data table for "
                    + productiveTerms.size() + " productive terms. Filename: " + inferredTriplesFiles);

            if (interimInferredTriples <= streamingThreshold) {
                // stream the data
                Set<Triple> inferredTriples = new HashSet<>();
                for (String inferredTriplesFile : inferredTriplesFiles) {
                    TripleUtils.loadCompressedCSVTriples(inferredTriplesFile, true, inferredTriples);
                }

                log.info("Total triples to stream into Big Data: " + inferredTriples.size());
                cloud.streamObjectsIntoBigData(inferredTriples, TableUtils.getBigQueryEncodedTripleTable(table));
                log.info("All inferred triples are streamed into Big Data table");

            } else {
                List<String> cloudStorageFiles = new ArrayList<>();
                // load the data through cloud storage
                // upload the file to cloud storage
                for (String inferredTriplesFile : inferredTriplesFiles) {
                    log.info("Uploading inferred triples file into cloud storage: " + inferredTriplesFile);
                    StorageObject file = cloud.uploadFileToCloudStorage(inferredTriplesFile, bucket);
                    log.info("File " + file + ", uploaded successfully. Now loading it into big data.");
                    cloudStorageFiles.add(file.getUri());
                }

                jobId = cloud.loadCloudStorageFilesIntoBigData(cloudStorageFiles,
                        TableUtils.getBigQueryEncodedTripleTable(table), false);
                log.info("All inferred triples are loaded into Big Data table through cloud storage, completed jobId: "
                        + jobId);
            }

            // reset empty retries
            emptyRetries = 0;
            stopwatch2.reset();

        } else {
            log.info("No new inferred triples");
            // increment empty retries
            emptyRetries++;
            if (!stopwatch2.isRunning()) {
                stopwatch2.start();
            }
        }

        log.info("Total inferred triples so far = " + totalInferredTriples + ", current retry count: "
                + emptyRetries);

        if (emptyRetries < maxRetries) {
            ApiUtils.block(cycleSleep);

            // FIXME move into the particular cloud implementation service
            long elapsed = System.currentTimeMillis() - start;
            decoratedTable = "[" + table + "@-" + elapsed + "-]";
            log.info("Using table decorator: " + decoratedTable + ". Empty retries count: " + emptyRetries);
        }

        count++;

    } while (emptyRetries < maxRetries); // end timestamp loop

    executor.shutdown();

    log.info("Finished reasoning, total inferred triples = " + totalInferredTriples);
    //log.info("Number of avoided duplicate terms = " + this.duplicates);
    log.info("Total rows retrieved from big data = " + this.totalRows);
    log.info("Total processed GBytes = " + ((double) this.totalBytes / FileUtils.ONE_GB));
    log.info("Total process reasoning time (serialization in inf file) = " + stopwatch1);
    log.info("Total time spent in empty inference cycles = " + stopwatch2);
}
From source file:io.ecarf.core.cloud.task.processor.reason.phase1.DoReasonTask5.java
@Override
public void run() throws IOException {
    GoogleCloudService cloud = (GoogleCloudService) this.getCloudService();
    //String table = metadata.getValue(EcarfMetaData.ECARF_TABLE);
    //Set<String> terms = metadata.getTerms();
    //String schemaFile = metadata.getValue(EcarfMetaData.ECARF_SCHEMA);
    //String bucket = metadata.getBucket();
    Stopwatch stopwatch1 = Stopwatch.createUnstarted();
    Stopwatch stopwatch2 = Stopwatch.createUnstarted();
    Set<String> termsSet;

    if (terms == null) {
        // too large, probably saved as a file
        //String termsFile = metadata.getValue(EcarfMetaData.ECARF_TERMS_FILE);
        log.info("Using json file for terms: " + termsFile);
        Validate.notNull(termsFile);

        String localTermsFile = Utils.TEMP_FOLDER + termsFile;
        cloud.downloadObjectFromCloudStorage(termsFile, localTermsFile, bucket);

        // convert from JSON
        termsSet = io.cloudex.framework.utils.FileUtils.jsonFileToSet(localTermsFile);
    } else {
        termsSet = ObjectUtils.csvToSet(terms);
    }

    String localSchemaFile = Utils.TEMP_FOLDER + schemaFile;
    // download the file from the cloud storage
    cloud.downloadObjectFromCloudStorage(schemaFile, localSchemaFile, bucket);

    // uncompress if compressed
    if (GzipUtils.isCompressedFilename(schemaFile)) {
        localSchemaFile = GzipUtils.getUncompressedFilename(localSchemaFile);
    }

    Map<String, Set<Triple>> allSchemaTriples = TripleUtils.getRelevantSchemaNTriples(localSchemaFile,
            TermUtils.RDFS_TBOX);

    // get all the triples we care about
    Map<Term, Set<Triple>> schemaTerms = new HashMap<>();
    for (String term : termsSet) {
        if (allSchemaTriples.containsKey(term)) {
            schemaTerms.put(new Term(term), allSchemaTriples.get(term));
        }
    }

    String decoratedTable = table;
    int emptyRetries = 0;
    int totalInferredTriples = 0;
    int maxRetries = Config.getIntegerProperty(Constants.REASON_RETRY_KEY, 6);

    String instanceId = cloud.getInstanceId();

    // timestamp loop
    do {
        List<String> productiveTerms = new ArrayList<>();
        int interimInferredTriples = 0;

        // First of all run all the queries asynchronously and remember the jobId and filename for each term
        List<Callable<Void>> queryTasks = new ArrayList<>();
        List<Callable<Void>> saveTasks = new ArrayList<>();

        for (Entry<Term, Set<Triple>> entry : schemaTerms.entrySet()) {
            Term term = entry.getKey();
            Set<Triple> triples = entry.getValue();

            QuerySubTask queryTask = new QuerySubTask(term, triples, decoratedTable, cloud);
            queryTasks.add(queryTask);

            SaveResultsSubTask saveTask = new SaveResultsSubTask(term, cloud);
            saveTasks.add(saveTask);
        }

        // invoke all the queries in parallel
        this.invokeAll(queryTasks);

        long start = System.currentTimeMillis();
        String inferredTriplesFile = Utils.TEMP_FOLDER + instanceId + '_' + start + Constants.DOT_INF;

        // save all the query results in files in parallel
        this.invokeAll(saveTasks);

        try (PrintWriter writer = new PrintWriter(
                new GZIPOutputStream(new FileOutputStream(inferredTriplesFile), Constants.GZIP_BUF_SIZE))) {

            // now loop through the queries
            for (Entry<Term, Set<Triple>> entry : schemaTerms.entrySet()) {
                Term term = entry.getKey();
                BigInteger rows = term.getRows();

                this.totalBytes = this.totalBytes + term.getBytes();

                // only process if triples are found matching this term
                if (!BigInteger.ZERO.equals(rows)) {
                    stopwatch1.start();

                    log.info("Reasoning for Term: " + term);

                    Set<Triple> schemaTriples = entry.getValue();
                    log.info("Schema Triples: " + Joiner.on('\n').join(schemaTriples));

                    List<String> select = GenericRule.getSelect(schemaTriples);

                    int inferredTriplesCount = this.inferAndSaveTriplesToFile(term, select, schemaTriples, rows,
                            decoratedTable, writer);

                    productiveTerms.add(term.getTerm());
                    interimInferredTriples += inferredTriplesCount;
                    this.totalRows = this.totalRows.add(rows);

                    stopwatch1.stop();
                } else {
                    log.info("Skipping term as no data found: " + term);
                }
            }
        }

        totalInferredTriples += interimInferredTriples;

        if (interimInferredTriples > 0) {
            // stream smaller numbers of inferred triples
            // try uploading from cloud storage
            int streamingThreshold = Config.getIntegerProperty("ecarf.io.reasoning.streaming.threshold", 100000);

            log.info("Inserting " + interimInferredTriples + ", inferred triples into Big Data table for "
                    + productiveTerms.size() + " productive terms. Filename: " + inferredTriplesFile);

            if (interimInferredTriples <= streamingThreshold) {
                // stream the data
                Set<Triple> inferredTriples = TripleUtils.loadCompressedCSVTriples(inferredTriplesFile, false);
                log.info("Total triples to stream into Big Data: " + inferredTriples.size());
                cloud.streamObjectsIntoBigData(inferredTriples, TableUtils.getBigQueryTripleTable(table));
                log.info("All inferred triples are streamed into Big Data table");

            } else {
                // load the data through cloud storage
                // upload the file to cloud storage
                log.info("Uploading inferred triples file into cloud storage: " + inferredTriplesFile);
                StorageObject file = cloud.uploadFileToCloudStorage(inferredTriplesFile, bucket);
                log.info("File " + file + ", uploaded successfully. Now loading it into big data.");

                String jobId = cloud.loadCloudStorageFilesIntoBigData(Lists.newArrayList(file.getUri()),
                        TableUtils.getBigQueryTripleTable(table), false);
                log.info("All inferred triples are loaded into Big Data table through cloud storage, completed jobId: "
                        + jobId);
            }

            // reset empty retries
            emptyRetries = 0;
            stopwatch2.reset();

        } else {
            log.info("No new inferred triples");
            // increment empty retries
            emptyRetries++;
            if (!stopwatch2.isRunning()) {
                stopwatch2.start();
            }
        }

        log.info("Total inferred triples so far = " + totalInferredTriples + ", current retry count: "
                + emptyRetries);

        if (emptyRetries < maxRetries) {
            ApiUtils.block(Config.getIntegerProperty(Constants.REASON_SLEEP_KEY, 20));

            // FIXME move into the particular cloud implementation service
            long elapsed = System.currentTimeMillis() - start;
            decoratedTable = "[" + table + "@-" + elapsed + "-]";
            log.info("Using table decorator: " + decoratedTable + ". Empty retries count: " + emptyRetries);
        }

    } while (emptyRetries < maxRetries); // end timestamp loop

    executor.shutdown();

    log.info("Finished reasoning, total inferred triples = " + totalInferredTriples);
    log.info("Number of avoided duplicate terms = " + this.duplicates);
    log.info("Total rows retrieved from big data = " + this.totalRows);
    log.info("Total processed GBytes = " + ((double) this.totalBytes / FileUtils.ONE_GB));
    log.info("Total process reasoning time (serialization in inf file) = " + stopwatch1);
    log.info("Total time spent in empty inference cycles = " + stopwatch2);
}
From source file:io.ecarf.core.cloud.task.processor.reason.phase2.DoReasonTask7.java
@Override
public void run() throws IOException {
    GoogleCloudService cloud = (GoogleCloudService) this.getCloudService();
    Stopwatch stopwatch1 = Stopwatch.createUnstarted();
    Stopwatch stopwatch2 = Stopwatch.createUnstarted();
    Set<String> termsSet;

    if (terms == null) {
        // too large, probably saved as a file
        log.info("Using json file for terms: " + termsFile);
        Validate.notNull(termsFile);

        String localTermsFile = Utils.TEMP_FOLDER + termsFile;
        cloud.downloadObjectFromCloudStorage(termsFile, localTermsFile, bucket);

        // convert from JSON
        termsSet = io.cloudex.framework.utils.FileUtils.jsonFileToSet(localTermsFile);
    } else {
        termsSet = ObjectUtils.csvToSet(terms);
    }

    String localSchemaFile = Utils.TEMP_FOLDER + schemaFile;
    // download the file from the cloud storage
    cloud.downloadObjectFromCloudStorage(schemaFile, localSchemaFile, bucket);

    // uncompress if compressed
    if (GzipUtils.isCompressedFilename(schemaFile)) {
        localSchemaFile = GzipUtils.getUncompressedFilename(localSchemaFile);
    }

    Map<Long, Set<Triple>> allSchemaTriples = TripleUtils.getRelevantSchemaETriples(localSchemaFile,
            TermUtils.RDFS_TBOX);

    // get all the triples we care about
    schemaTerms = new HashMap<>();
    for (String termStr : termsSet) {
        Long term = Long.parseLong(termStr);
        if (allSchemaTriples.containsKey(term)) {
            schemaTerms.put(term, allSchemaTriples.get(term));
        }
    }

    String decoratedTable = table;
    int emptyRetries = 0;
    int totalInferredTriples = 0;
    int maxRetries = Config.getIntegerProperty(Constants.REASON_RETRY_KEY, 6);

    String instanceId = cloud.getInstanceId();

    QueryGenerator<Long> generator = new QueryGenerator<Long>(schemaTerms, null);

    // timestamp loop
    do {
        Set<Long> productiveTerms = new HashSet<>();
        int interimInferredTriples = 0;

        // First of all run all the queries asynchronously and remember the jobId and filename for each term
        List<QueryResult> queryResults = new ArrayList<QueryResult>();

        generator.setDecoratedTable(decoratedTable);
        List<String> queries = generator.getQueries();
        log.debug("Generated Queries: " + queries);

        String queryResultFilePrefix = Utils.TEMP_FOLDER + instanceId + '_' + System.currentTimeMillis()
                + "_QueryResults_";
        int fileCount = 0;

        for (String query : queries) {
            String jobId = cloud.startBigDataQuery(query);
            queryResults.add(QueryResult.create().setFilename(queryResultFilePrefix + fileCount).setJobId(jobId));
            fileCount++;
        }

        // invoke all the queries in parallel
        //this.invokeAll(queryTasks);

        long start = System.currentTimeMillis();
        String inferredTriplesFile = Utils.TEMP_FOLDER + instanceId + '_' + start + Constants.DOT_INF;

        // save all the query results in files in parallel
        //this.invokeAll(saveTasks);

        for (QueryResult queryResult : queryResults) {
            try {
                // block and wait for each job to complete then save results to a file
                QueryStats stats = cloud.saveBigQueryResultsToFile(queryResult.getJobId(),
                        queryResult.getFilename());
                queryResult.setStats(stats);
            } catch (IOException ioe) {
                // transient backend errors
                log.warn("failed to save query results to file, jobId: " + queryResult.getJobId(), ioe);
                //TODO should throw an exception
            }
        }

        try (PrintWriter writer = new PrintWriter(
                new GZIPOutputStream(new FileOutputStream(inferredTriplesFile), Constants.GZIP_BUF_SIZE))) {

            // now loop through the queries
            //for(Entry<Term, Set<Triple>> entry: schemaTerms.entrySet()) {
            for (QueryResult queryResult : queryResults) {
                //Term term = entry.getKey();

                QueryStats stats = queryResult.getStats();
                BigInteger rows = stats.getTotalRows(); //term.getRows();

                this.totalBytes = this.totalBytes + stats.getTotalProcessedBytes(); //term.getBytes();

                // only process if triples are found matching this term
                if (!BigInteger.ZERO.equals(rows)) {
                    stopwatch1.start();

                    int inferredTriplesCount = this.inferAndSaveTriplesToFile(queryResult, productiveTerms,
                            decoratedTable, writer);

                    interimInferredTriples += inferredTriplesCount;
                    this.totalRows = this.totalRows.add(rows);

                    stopwatch1.stop();
                } else {
                    log.info("Skipping query as no data is found");
                }
            }
        }

        totalInferredTriples += interimInferredTriples;

        if (interimInferredTriples > 0) {
            // stream smaller numbers of inferred triples
            // try uploading from cloud storage
            int streamingThreshold = Config.getIntegerProperty("ecarf.io.reasoning.streaming.threshold", 100000);

            log.info("Inserting " + interimInferredTriples + ", inferred triples into Big Data table for "
                    + productiveTerms.size() + " productive terms. Filename: " + inferredTriplesFile);

            if (interimInferredTriples <= streamingThreshold) {
                // stream the data
                Set<Triple> inferredTriples = TripleUtils.loadCompressedCSVTriples(inferredTriplesFile, true);
                log.info("Total triples to stream into Big Data: " + inferredTriples.size());
                cloud.streamObjectsIntoBigData(inferredTriples, TableUtils.getBigQueryEncodedTripleTable(table));
                log.info("All inferred triples are streamed into Big Data table");

            } else {
                // load the data through cloud storage
                // upload the file to cloud storage
                log.info("Uploading inferred triples file into cloud storage: " + inferredTriplesFile);
                StorageObject file = cloud.uploadFileToCloudStorage(inferredTriplesFile, bucket);
                log.info("File " + file + ", uploaded successfully. Now loading it into big data.");

                String jobId = cloud.loadCloudStorageFilesIntoBigData(Lists.newArrayList(file.getUri()),
                        TableUtils.getBigQueryEncodedTripleTable(table), false);
                log.info("All inferred triples are loaded into Big Data table through cloud storage, completed jobId: "
                        + jobId);
            }

            // reset empty retries
            emptyRetries = 0;
            stopwatch2.reset();

        } else {
            log.info("No new inferred triples");
            // increment empty retries
            emptyRetries++;
            if (!stopwatch2.isRunning()) {
                stopwatch2.start();
            }
        }

        log.info("Total inferred triples so far = " + totalInferredTriples + ", current retry count: "
                + emptyRetries);

        if (emptyRetries < maxRetries) {
            ApiUtils.block(Config.getIntegerProperty(Constants.REASON_SLEEP_KEY, 20));

            // FIXME move into the particular cloud implementation service
            long elapsed = System.currentTimeMillis() - start;
            decoratedTable = "[" + table + "@-" + elapsed + "-]";
            log.info("Using table decorator: " + decoratedTable + ". Empty retries count: " + emptyRetries);
        }

    } while (emptyRetries < maxRetries); // end timestamp loop

    //executor.shutdown();

    log.info("Finished reasoning, total inferred triples = " + totalInferredTriples);
    //log.info("Number of avoided duplicate terms = " + this.duplicates);
    log.info("Total rows retrieved from big data = " + this.totalRows);
    log.info("Total processed GBytes = " + ((double) this.totalBytes / FileUtils.ONE_GB));
    log.info("Total process reasoning time (serialization in inf file) = " + stopwatch1);
    log.info("Total time spent in empty inference cycles = " + stopwatch2);
}
From source file:io.ecarf.core.cloud.task.processor.reason.phase2.DoReasonTask6.java
@Override
public void run() throws IOException {
    GoogleCloudService cloud = (GoogleCloudService) this.getCloudService();
    //String table = metadata.getValue(EcarfMetaData.ECARF_TABLE);
    //Set<String> terms = metadata.getTerms();
    //String schemaFile = metadata.getValue(EcarfMetaData.ECARF_SCHEMA);
    //String bucket = metadata.getBucket();
    Stopwatch stopwatch1 = Stopwatch.createUnstarted();
    Stopwatch stopwatch2 = Stopwatch.createUnstarted();
    Set<String> termsSet;

    if (terms == null) {
        // too large, probably saved as a file
        //String termsFile = metadata.getValue(EcarfMetaData.ECARF_TERMS_FILE);
        log.info("Using json file for terms: " + termsFile);
        Validate.notNull(termsFile);

        String localTermsFile = Utils.TEMP_FOLDER + termsFile;
        cloud.downloadObjectFromCloudStorage(termsFile, localTermsFile, bucket);

        // convert from JSON
        termsSet = io.cloudex.framework.utils.FileUtils.jsonFileToSet(localTermsFile);
    } else {
        termsSet = ObjectUtils.csvToSet(terms);
    }

    String localSchemaFile = Utils.TEMP_FOLDER + schemaFile;
    // download the file from the cloud storage
    cloud.downloadObjectFromCloudStorage(schemaFile, localSchemaFile, bucket);

    // uncompress if compressed
    if (GzipUtils.isCompressedFilename(schemaFile)) {
        localSchemaFile = GzipUtils.getUncompressedFilename(localSchemaFile);
    }

    Map<String, Set<Triple>> allSchemaTriples = TripleUtils.getRelevantSchemaNTriples(localSchemaFile,
            TermUtils.RDFS_TBOX);

    // get all the triples we care about
    schemaTerms = new HashMap<>();
    for (String term : termsSet) {
        if (allSchemaTriples.containsKey(term)) {
            schemaTerms.put(term, allSchemaTriples.get(term));
        }
    }

    String decoratedTable = table;
    int emptyRetries = 0;
    int totalInferredTriples = 0;
    int maxRetries = Config.getIntegerProperty(Constants.REASON_RETRY_KEY, 6);

    String instanceId = cloud.getInstanceId();

    QueryGenerator<String> generator = new QueryGenerator<String>(schemaTerms, null);

    // timestamp loop
    do {
        Set<String> productiveTerms = new HashSet<>();
        int interimInferredTriples = 0;

        // First of all run all the queries asynchronously and remember the jobId and filename for each term
        List<QueryResult> queryResults = new ArrayList<QueryResult>();

        generator.setDecoratedTable(decoratedTable);
        List<String> queries = generator.getQueries();
        log.debug("Generated Queries: " + queries);

        String queryResultFilePrefix = Utils.TEMP_FOLDER + instanceId + '_' + System.currentTimeMillis()
                + "_QueryResults_";
        int fileCount = 0;

        for (String query : queries) {
            String jobId = cloud.startBigDataQuery(query);
            queryResults.add(QueryResult.create().setFilename(queryResultFilePrefix + fileCount).setJobId(jobId));
            fileCount++;
        }

        // invoke all the queries in parallel
        //this.invokeAll(queryTasks);

        long start = System.currentTimeMillis();
        String inferredTriplesFile = Utils.TEMP_FOLDER + instanceId + '_' + start + Constants.DOT_INF;

        // save all the query results in files in parallel
        //this.invokeAll(saveTasks);

        for (QueryResult queryResult : queryResults) {
            try {
                // block and wait for each job to complete then save results to a file
                QueryStats stats = cloud.saveBigQueryResultsToFile(queryResult.getJobId(),
                        queryResult.getFilename());
                queryResult.setStats(stats);
            } catch (IOException ioe) {
                // transient backend errors
                log.warn("failed to save query results to file, jobId: " + queryResult.getJobId(), ioe);
                //TODO should throw an exception
            }
        }

        try (PrintWriter writer = new PrintWriter(
                new GZIPOutputStream(new FileOutputStream(inferredTriplesFile), Constants.GZIP_BUF_SIZE))) {

            // now loop through the queries
            //for(Entry<Term, Set<Triple>> entry: schemaTerms.entrySet()) {
            for (QueryResult queryResult : queryResults) {
                //Term term = entry.getKey();

                QueryStats stats = queryResult.getStats();
                BigInteger rows = stats.getTotalRows(); //term.getRows();

                this.totalBytes = this.totalBytes + stats.getTotalProcessedBytes(); //term.getBytes();

                // only process if triples are found matching this term
                if (!BigInteger.ZERO.equals(rows)) {
                    stopwatch1.start();

                    int inferredTriplesCount = this.inferAndSaveTriplesToFile(queryResult, productiveTerms,
                            decoratedTable, writer);

                    interimInferredTriples += inferredTriplesCount;
                    this.totalRows = this.totalRows.add(rows);

                    stopwatch1.stop();
                } else {
                    log.info("Skipping query as no data is found");
                }
            }
        }

        totalInferredTriples += interimInferredTriples;

        if (interimInferredTriples > 0) {
            // stream smaller numbers of inferred triples
            // try uploading from cloud storage
            int streamingThreshold = Config.getIntegerProperty("ecarf.io.reasoning.streaming.threshold", 100000);

            log.info("Inserting " + interimInferredTriples + ", inferred triples into Big Data table for "
                    + productiveTerms.size() + " productive terms. Filename: " + inferredTriplesFile);

            if (interimInferredTriples <= streamingThreshold) {
                // stream the data
                Set<Triple> inferredTriples = TripleUtils.loadCompressedCSVTriples(inferredTriplesFile, false);
                log.info("Total triples to stream into Big Data: " + inferredTriples.size());
                cloud.streamObjectsIntoBigData(inferredTriples, TableUtils.getBigQueryTripleTable(table));
                log.info("All inferred triples are streamed into Big Data table");

            } else {
                // load the data through cloud storage
                // upload the file to cloud storage
                log.info("Uploading inferred triples file into cloud storage: " + inferredTriplesFile);
                StorageObject file = cloud.uploadFileToCloudStorage(inferredTriplesFile, bucket);
                log.info("File " + file + ", uploaded successfully. Now loading it into big data.");

                String jobId = cloud.loadCloudStorageFilesIntoBigData(Lists.newArrayList(file.getUri()),
                        TableUtils.getBigQueryTripleTable(table), false);
                log.info("All inferred triples are loaded into Big Data table through cloud storage, completed jobId: "
                        + jobId);
            }

            // reset empty retries
            emptyRetries = 0;
            stopwatch2.reset();

        } else {
            log.info("No new inferred triples");
            // increment empty retries
            emptyRetries++;
            if (!stopwatch2.isRunning()) {
                stopwatch2.start();
            }
        }

        log.info("Total inferred triples so far = " + totalInferredTriples + ", current retry count: "
                + emptyRetries);

        if (emptyRetries < maxRetries) {
            ApiUtils.block(Config.getIntegerProperty(Constants.REASON_SLEEP_KEY, 20));

            // FIXME move into the particular cloud implementation service
            long elapsed = System.currentTimeMillis() - start;
            decoratedTable = "[" + table + "@-" + elapsed + "-]";
            log.info("Using table decorator: " + decoratedTable + ". Empty retries count: " + emptyRetries);
        }

    } while (emptyRetries < maxRetries); // end timestamp loop

    //executor.shutdown();

    log.info("Finished reasoning, total inferred triples = " + totalInferredTriples);
    log.info("Number of avoided duplicate terms = " + this.duplicates);
    log.info("Total rows retrieved from big data = " + this.totalRows);
    log.info("Total processed GBytes = " + ((double) this.totalBytes / FileUtils.ONE_GB));
    log.info("Total process reasoning time (serialization in inf file) = " + stopwatch1);
    log.info("Total time spent in empty inference cycles = " + stopwatch2);
}
From source file:io.ecarf.core.cloud.task.processor.reason.phase2.DoReasonTask8.java
@Override
public void run() throws IOException {
    GoogleCloudService cloud = (GoogleCloudService) this.getCloudService();
    Stopwatch stopwatch1 = Stopwatch.createUnstarted();
    Stopwatch stopwatch2 = Stopwatch.createUnstarted();
    Set<String> termsSet;

    if (terms == null) {
        // too large, probably saved as a file
        log.info("Using json file for terms: " + termsFile);
        Validate.notNull(termsFile);

        String localTermsFile = Utils.TEMP_FOLDER + termsFile;
        cloud.downloadObjectFromCloudStorage(termsFile, localTermsFile, bucket);

        // convert from JSON
        termsSet = io.cloudex.framework.utils.FileUtils.jsonFileToSet(localTermsFile);
    } else {
        termsSet = ObjectUtils.csvToSet(terms);
    }

    String localSchemaFile = Utils.TEMP_FOLDER + schemaFile;
    // download the file from the cloud storage
    cloud.downloadObjectFromCloudStorage(schemaFile, localSchemaFile, bucket);

    // uncompress if compressed
    if (GzipUtils.isCompressedFilename(schemaFile)) {
        localSchemaFile = GzipUtils.getUncompressedFilename(localSchemaFile);
    }

    Map<Long, Set<Triple>> allSchemaTriples = TripleUtils.getRelevantSchemaETriples(localSchemaFile,
            TermUtils.RDFS_TBOX);

    // get all the triples we care about
    schemaTerms = new HashMap<>();
    for (String termStr : termsSet) {
        Long term = Long.parseLong(termStr);
        if (allSchemaTriples.containsKey(term)) {
            schemaTerms.put(term, allSchemaTriples.get(term));
        }
    }

    String decoratedTable = table;
    int emptyRetries = 0;
    int totalInferredTriples = 0;
    int maxRetries = Config.getIntegerProperty(Constants.REASON_RETRY_KEY, 6);
    this.ddLimit = Config.getIntegerProperty(Constants.REASON_DATA_DIRECT_DOWNLOAD_LIMIT, 1_200_000);

    String instanceId = cloud.getInstanceId();

    QueryGenerator<Long> generator = new QueryGenerator<Long>(schemaTerms, null);

    // timestamp loop
    do {
        Set<Long> productiveTerms = new HashSet<>();
        int interimInferredTriples = 0;

        // First of all run all the queries asynchronously and remember the jobId and filename for each term
        List<QueryResult> queryResults = new ArrayList<QueryResult>();

        generator.setDecoratedTable(decoratedTable);
        List<String> queries = generator.getQueries();
        log.debug("Generated Queries: " + queries);

        String queryResultFilePrefix = instanceId + '_' + System.currentTimeMillis() + "_QueryResults_";
        int fileCount = 0;

        for (String query : queries) {
            String jobId = cloud.startBigDataQuery(query, new BigDataTable(this.table));
            queryResults.add(QueryResult.create().setFilename(queryResultFilePrefix + fileCount).setJobId(jobId));
            fileCount++;
        }

        // invoke all the queries in parallel
        //this.invokeAll(queryTasks);

        long start = System.currentTimeMillis();
        String inferredTriplesFile = Utils.TEMP_FOLDER + instanceId + '_' + start + Constants.DOT_INF;

        // save all the query results in files in parallel
        //this.invokeAll(saveTasks);

        for (QueryResult queryResult : queryResults) {
            try {
                // block and wait for each job to complete then save results to a file
                QueryStats stats = cloud.saveBigQueryResultsToFile(queryResult.getJobId(),
                        queryResult.getFilename(), this.bucket, null, this.ddLimit);
                queryResult.setStats(stats);
            } catch (IOException ioe) {
                log.error("failed to save query results to file, jobId: " + queryResult.getJobId(), ioe);
                throw ioe;
            }
        }

        try (PrintWriter writer = new PrintWriter(
                new GZIPOutputStream(new FileOutputStream(inferredTriplesFile), Constants.GZIP_BUF_SIZE))) {

            // now loop through the queries
            //for(Entry<Term, Set<Triple>> entry: schemaTerms.entrySet()) {
            for (QueryResult queryResult : queryResults) {
                //Term term = entry.getKey();

                QueryStats stats = queryResult.getStats();
                BigInteger rows = stats.getTotalRows(); //term.getRows();

                this.totalBytes = this.totalBytes + stats.getTotalProcessedBytes(); //term.getBytes();

                // only process if triples are found matching this term
                if (!BigInteger.ZERO.equals(rows)) {
                    stopwatch1.start();

                    int inferredTriplesCount = this.inferAndSaveTriplesToFile(queryResult, productiveTerms,
                            writer);

                    interimInferredTriples += inferredTriplesCount;
                    this.totalRows = this.totalRows.add(rows);

                    stopwatch1.stop();
                } else {
                    log.info("Skipping query as no data is found");
                }
            }
        }

        totalInferredTriples += interimInferredTriples;

        if (interimInferredTriples > 0) {
            // stream smaller numbers of inferred triples
            // try uploading from cloud storage
            int streamingThreshold = Config.getIntegerProperty("ecarf.io.reasoning.streaming.threshold", 100000);

            log.info("Inserting " + interimInferredTriples + ", inferred triples into Big Data table for "
                    + productiveTerms.size() + " productive terms. Filename: " + inferredTriplesFile);

            if (interimInferredTriples <= streamingThreshold) {
                // stream the data
                Set<Triple> inferredTriples = TripleUtils.loadCompressedCSVTriples(inferredTriplesFile, true);
                log.info("Total triples to stream into Big Data: " + inferredTriples.size());
                cloud.streamObjectsIntoBigData(inferredTriples, TableUtils.getBigQueryEncodedTripleTable(table));
                log.info("All inferred triples are streamed into Big Data table");

            } else {
                // load the data through cloud storage
                // upload the file to cloud storage
                log.info("Uploading inferred triples file into cloud storage: " + inferredTriplesFile);
                StorageObject file = cloud.uploadFileToCloudStorage(inferredTriplesFile, bucket);
                log.info("File " + file + ", uploaded successfully. Now loading it into big data.");

                String jobId = cloud.loadCloudStorageFilesIntoBigData(Lists.newArrayList(file.getUri()),
                        TableUtils.getBigQueryEncodedTripleTable(table), false);
                log.info("All inferred triples are loaded into Big Data table through cloud storage, completed jobId: "
                        + jobId);
            }

            // reset empty retries
            emptyRetries = 0;
            stopwatch2.reset();

        } else {
            log.info("No new inferred triples");
            // increment empty retries
            emptyRetries++;
            if (!stopwatch2.isRunning()) {
                stopwatch2.start();
            }
        }

        log.info("Total inferred triples so far = " + totalInferredTriples + ", current retry count: "
                + emptyRetries);

        if (emptyRetries < maxRetries) {
            ApiUtils.block(Config.getIntegerProperty(Constants.REASON_SLEEP_KEY, 20));

            // FIXME move into the particular cloud implementation service
            long elapsed = System.currentTimeMillis() - start;
            decoratedTable = "[" + table + "@-" + elapsed + "-]";
            log.info("Using table decorator: " + decoratedTable + ". Empty retries count: " + emptyRetries);
        }

    } while (emptyRetries < maxRetries); // end timestamp loop

    //executor.shutdown();

    log.info("Finished reasoning, total inferred triples = " + totalInferredTriples);
    //log.info("Number of avoided duplicate terms = " + this.duplicates);
    log.info("Total rows retrieved from big data = " + this.totalRows);
    log.info("Total processed GBytes = " + ((double) this.totalBytes / FileUtils.ONE_GB));
    log.info("Total process reasoning time (serialization in inf file) = " + stopwatch1);
    log.info("Total time spent in empty inference cycles = " + stopwatch2);
}
From source file:com.sunchenbin.store.feilong.core.io.FileUtil.java
/**
 * Formats a file size given in bytes as a human-readable string.
 * (The original Javadoc was in Chinese and arrived garbled; this is a reconstruction from context.)
 *
 * <p>
 * Sizes are rendered with a GB, MB or KB unit; anything below one kilobyte is rendered in Bytes.
 * </p>
 *
 * <p>
 * Unlike Commons IO 2.4's {@link org.apache.commons.io.FileUtils#byteCountToDisplaySize(long)},
 * which rounds down (for example 1.99 GB is displayed as 1 GB), this method keeps the fraction.
 * </p>
 *
 * @param fileSize
 *            the file size in bytes
 * @return the formatted size string
 * @see #getFileSize(File)
 * @see org.apache.commons.io.FileUtils#ONE_GB
 * @see org.apache.commons.io.FileUtils#ONE_MB
 * @see org.apache.commons.io.FileUtils#ONE_KB
 * @see org.apache.commons.io.FileUtils#byteCountToDisplaySize(long)
 */
public static String formatSize(long fileSize) {
    String danwei = ""; // unit label
    long chushu = 1;    // divisor
    if (fileSize >= FileUtils.ONE_GB) {
        danwei = "GB";
        chushu = FileUtils.ONE_GB;
    } else if (fileSize >= FileUtils.ONE_MB) {
        danwei = "MB";
        chushu = FileUtils.ONE_MB;
    } else if (fileSize >= FileUtils.ONE_KB) {
        danwei = "KB";
        chushu = FileUtils.ONE_KB;
    } else {
        return fileSize + "Bytes";
    }
    // two digits of the fractional part; note that a fraction below .10 loses its
    // leading zero here (e.g. 1.05 GB renders as "1.5GB")
    String yushu = 100 * (fileSize % chushu) / chushu + ""; // remainder
    if ("0".equals(yushu)) {
        return fileSize / chushu + danwei;
    }
    return fileSize / chushu + "." + yushu + danwei;
}
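For reference, a hypothetical usage sketch of formatSize (the values are computed from the code above, not taken from the project's own tests):

long size = 1536 * FileUtils.ONE_MB;                        // 1.5 GB in bytes
System.out.println(FileUtil.formatSize(size));              // prints "1.50GB"
System.out.println(FileUtil.formatSize(FileUtils.ONE_GB));  // exact multiple, prints "1GB"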
From source file:org.apache.eagle.jpm.analyzer.mr.suggestion.MapReduceTaskNumProcessor.java
private String analyzeReduceTaskNum(List<String> optSettings) {
    StringBuilder sb = new StringBuilder();

    long numReduces = context.getNumReduces();
    if (numReduces > 0) {
        long avgReduceTime = context.getAvgReduceTimeInSec();
        long avgShuffleTime = context.getAvgShuffleTimeInSec();
        long avgShuffleBytes = context.getJob().getReduceCounters()
                .getCounterValue(JobCounters.CounterName.REDUCE_SHUFFLE_BYTES) / numReduces;
        long avgReduceOutput = context.getJob().getReduceCounters()
                .getCounterValue(JobCounters.CounterName.HDFS_BYTES_WRITTEN) / numReduces;
        long avgReduceTotalTime = avgShuffleTime + avgReduceTime;

        long suggestReduces = 0;
        StringBuilder tmpsb = new StringBuilder();

        String avgShuffleDisplaySize = bytesToHumanReadable(avgShuffleBytes);
        if (avgShuffleBytes < 256 * FileUtils.ONE_MB && avgReduceTotalTime < 300
                && avgReduceOutput < 256 * FileUtils.ONE_MB && numReduces > 1) {
            tmpsb.append("average reduce input bytes is: ").append(avgShuffleDisplaySize).append(", ");
            suggestReduces = getReduceNum(avgShuffleBytes, avgReduceOutput, avgReduceTime);
        } else if (avgShuffleBytes > 10 * FileUtils.ONE_GB && avgReduceTotalTime > 1800) {
            tmpsb.append("average reduce input bytes is: ").append(avgShuffleDisplaySize).append(", ");
            suggestReduces = getReduceNum(avgShuffleBytes, avgReduceOutput, avgReduceTime);
        }

        if (avgReduceTotalTime < 60 && numReduces > 1) {
            tmpsb.append("average reduce time is only ").append(avgReduceTotalTime).append(" seconds, ");
            if (suggestReduces == 0) {
                suggestReduces = getReduceNum(avgShuffleBytes, avgReduceOutput, avgReduceTime);
            }
        } else if (avgReduceTotalTime > 3600 && avgReduceTime > 1800) {
            tmpsb.append("average reduce time is ").append(avgReduceTotalTime).append(" seconds, ");
            if (suggestReduces == 0) {
                suggestReduces = getReduceNum(avgShuffleBytes, avgReduceOutput, avgReduceTime);
            }
        }

        String avgReduceOutputDisplaySize = bytesToHumanReadable(avgReduceOutput);
        if (avgReduceOutput < 10 * FileUtils.ONE_MB && avgReduceTime < 300
                && avgShuffleBytes < 2 * FileUtils.ONE_GB && numReduces > 1) {
            tmpsb.append(" average reduce output is only ").append(avgReduceOutputDisplaySize).append(", ");
            if (suggestReduces == 0) {
                suggestReduces = getReduceNum(avgShuffleBytes, avgReduceOutput, avgReduceTime);
            }
        } else if (avgReduceOutput > 10 * FileUtils.ONE_GB && avgReduceTime > 1800) {
            tmpsb.append(" average reduce output is ").append(avgReduceOutputDisplaySize).append(", ");
            if (suggestReduces == 0) {
                suggestReduces = getReduceNum(avgShuffleBytes, avgReduceOutput, avgReduceTime);
            }
        }

        if (suggestReduces > 0) {
            sb.append("Best practice: ").append(tmpsb.toString()).append("please consider ");
            if (suggestReduces > numReduces) {
                sb.append("increasing the ");
            } else {
                sb.append("decreasing the ");
            }
            String setting = String.format("-D%s=%s", NUM_REDUCES, suggestReduces);
            sb.append("reducer number. You could try ").append(setting).append("\n");
            optSettings.add(setting);
        }
    }
    return sb.toString();
}
From source file:org.apache.eagle.jpm.analyzer.mr.suggestion.MapReduceTaskNumProcessor.java
private String analyzeMapTaskNum(List<String> optSettings) {
    StringBuilder sb = new StringBuilder();

    long numMaps = context.getNumMaps();
    long avgMapTime = context.getAvgMapTimeInSec();
    long avgMapInput = context.getJob().getMapCounters()
            .getCounterValue(JobCounters.CounterName.HDFS_BYTES_READ) / numMaps;

    String avgMapInputDisplaySize = bytesToHumanReadable(avgMapInput);
    if (avgMapInput < 5 * FileUtils.ONE_MB && avgMapTime < 30 && numMaps > 1) {
        sb.append("Best practice: average map input bytes only have ").append(avgMapInputDisplaySize);
        sb.append(". Please reduce the number of mappers by merging input files.\n");
    } else if (avgMapInput > FileUtils.ONE_GB) {
        sb.append("Best practice: average map input bytes have ").append(avgMapInputDisplaySize);
        sb.append(". Please increase the number of mappers by using splittable compression, a container file format or a smaller block size.\n");
    }

    if (avgMapTime < 10 && numMaps > 1) {
        sb.append("Best practice: average map time only have ").append(avgMapTime);
        sb.append(" seconds. Please reduce the number of mappers by merging input files or by using a larger block size.\n");
    } else if (avgMapTime > 600 && avgMapInput < FileUtils.ONE_GB) {
        // note: the original code appends avgMapInput here although the message reads
        // "average map time is ... seconds"; avgMapTime was presumably intended
        sb.append("Best practice: average map time is ").append(avgMapInput);
        sb.append(" seconds. Please increase the number of mappers by using splittable compression, a container file format or a smaller block size.\n");
    }

    return sb.toString();
}
From source file:org.apache.eagle.jpm.analyzer.mr.suggestion.MapReduceTaskNumProcessor.java
private long getReduceNum(long avgInputBytes, long avgOutputBytes, long avgTime) {
    long newReduceNum = 1;
    long tmpReduceNum;
    long numReduces = context.getNumReduces();

    // size the reducer count so each reducer shuffles roughly 3 GB of input
    tmpReduceNum = avgInputBytes * numReduces / (3 * FileUtils.ONE_GB);
    if (tmpReduceNum > newReduceNum) {
        newReduceNum = tmpReduceNum;
    }

    // ... writes roughly 2 GB of output
    tmpReduceNum = avgOutputBytes * numReduces / (2 * FileUtils.ONE_GB);
    if (tmpReduceNum > newReduceNum) {
        newReduceNum = tmpReduceNum;
    }

    // ... and runs for roughly 10 minutes
    tmpReduceNum = avgTime * numReduces / (10 * 60);
    if (tmpReduceNum > newReduceNum) {
        newReduceNum = tmpReduceNum;
    }

    return newReduceNum;
}
From source file:org.apache.usergrid.services.assets.data.S3BinaryStore.java
@Override
public void write(final UUID appId, final Entity entity, InputStream inputStream) throws IOException {

    String uploadFileName = AssetUtils.buildAssetKey(appId, entity);
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    long written = IOUtils.copyLarge(inputStream, baos, 0, FIVE_MB);

    byte[] data = baos.toByteArray();

    final Map<String, Object> fileMetadata = AssetUtils.getFileMetadata(entity);
    fileMetadata.put(AssetUtils.LAST_MODIFIED, System.currentTimeMillis());

    String mimeType = AssetMimeHandler.get().getMimeType(entity, data);

    if (written < FIVE_MB) { // total smaller than 5mb

        BlobStore blobStore = getContext().getBlobStore();
        BlobBuilder.PayloadBlobBuilder bb = blobStore.blobBuilder(uploadFileName).payload(data).calculateMD5()
                .contentType(mimeType);

        fileMetadata.put(AssetUtils.CONTENT_LENGTH, written);
        if (fileMetadata.get(AssetUtils.CONTENT_DISPOSITION) != null) {
            bb.contentDisposition(fileMetadata.get(AssetUtils.CONTENT_DISPOSITION).toString());
        }
        final Blob blob = bb.build();

        String md5sum = Hex.encodeHexString(blob.getMetadata().getContentMetadata().getContentMD5());
        fileMetadata.put(AssetUtils.CHECKSUM, md5sum);

        String eTag = blobStore.putBlob(bucketName, blob);
        fileMetadata.put(AssetUtils.E_TAG, eTag);

    } else { // bigger than 5mb... dump 5 mb tmp files and upload from them

        // todo: yes, AsyncBlobStore is deprecated, but there appears to be no replacement yet
        final AsyncBlobStore blobStore = getContext().getAsyncBlobStore();

        File tempFile = File.createTempFile(entity.getUuid().toString(), "tmp");
        tempFile.deleteOnExit();

        OutputStream os = null;
        try {
            os = new BufferedOutputStream(new FileOutputStream(tempFile.getAbsolutePath()));
            os.write(data);
            written += IOUtils.copyLarge(inputStream, os, 0, (FileUtils.ONE_GB * 5));
        } finally {
            IOUtils.closeQuietly(os);
        }

        BlobBuilder.PayloadBlobBuilder bb = blobStore.blobBuilder(uploadFileName).payload(tempFile)
                .calculateMD5().contentType(mimeType);

        fileMetadata.put(AssetUtils.CONTENT_LENGTH, written);
        if (fileMetadata.get(AssetUtils.CONTENT_DISPOSITION) != null) {
            bb.contentDisposition(fileMetadata.get(AssetUtils.CONTENT_DISPOSITION).toString());
        }
        final Blob blob = bb.build();

        final File finalTempFile = tempFile;

        final ListenableFuture<String> future = blobStore.putBlob(bucketName, blob,
                PutOptions.Builder.multipart());

        Runnable listener = new Runnable() {
            @Override
            public void run() {
                try {
                    String eTag = future.get();
                    fileMetadata.put(AssetUtils.E_TAG, eTag);
                    EntityManager em = emf.getEntityManager(appId);
                    em.update(entity);
                    finalTempFile.delete();
                } catch (Exception e) {
                    LOG.error("error uploading", e);
                }
                if (finalTempFile != null && finalTempFile.exists()) {
                    finalTempFile.delete();
                }
            }
        };
        future.addListener(listener, executor);
    }
}