Usage examples for org.apache.commons.compress.compressors.gzip.GzipUtils#isCompressedFilename, collected from open-source projects.
public static boolean isCompressedFilename(String filename)
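Before the project examples below, a minimal self-contained sketch of what the check does (file names here are hypothetical; the method inspects only the name suffix, never the file contents):

import org.apache.commons.compress.compressors.gzip.GzipUtils;

public class GzipUtilsDemo {
    public static void main(String[] args) {
        // Recognized gzip suffixes include .gz, .tgz, .taz, .svgz and .z
        System.out.println(GzipUtils.isCompressedFilename("data.txt.gz"));    // true
        System.out.println(GzipUtils.isCompressedFilename("archive.tgz"));    // true
        System.out.println(GzipUtils.isCompressedFilename("data.txt"));       // false

        // The companion method maps a compressed name to its uncompressed form
        System.out.println(GzipUtils.getUncompressedFilename("data.txt.gz")); // data.txt
        System.out.println(GzipUtils.getUncompressedFilename("archive.tgz")); // archive.tar
    }
}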
From source file:com.zenome.bundlebus.Util.java
public static boolean unZip(@NonNull final File aGzFile, @NonNull final File aOutputFile)
        throws FileNotFoundException, IOException {
    Log.d(TAG, "gz filename : " + aGzFile);
    Log.d(TAG, "output file : " + aOutputFile);
    // suffix-based check only; bail out if the name does not look gzip-compressed
    if (!GzipUtils.isCompressedFilename(aGzFile.getAbsolutePath())) {
        Log.d(TAG, "This file is not a compressed file : " + aGzFile.getAbsolutePath());
        return false;
    }
    // try-with-resources ensures the streams are closed even if the copy fails
    try (BufferedInputStream in = new BufferedInputStream(new FileInputStream(aGzFile));
            GzipCompressorInputStream gzIn = new GzipCompressorInputStream(in);
            FileOutputStream out = new FileOutputStream(aOutputFile)) {
        final byte[] buffer = new byte[4096];
        int n;
        while (-1 != (n = gzIn.read(buffer))) {
            out.write(buffer, 0, n);
        }
    }
    return true;
}
From source file:net.sf.util.zip.FileNameUtil.java
/**
 * @param name the file name
 * @return whether the file is a valid compressed file (based on suffix only)
 */
public static boolean isCompressedFile(String name) {
    return BZip2Utils.isCompressedFilename(name) || GzipUtils.isCompressedFilename(name)
            || XZUtils.isCompressedFilename(name);
}
From source file:io.ecarf.core.cloud.task.coordinator.CountSchemaTermTask.java
@Override
public void run() throws IOException {
    log.info("Processing schema terms");
    //String schemaFile = metadata.getValue(EcarfMetaData.ECARF_SCHEMA);
    //String bucket = metadata.getBucket();
    // check if we have a source bucket for backward compatibility
    if (StringUtils.isBlank(sourceBucket)) {
        log.warn("sourceBucket is empty, using bucket: " + bucket);
        this.sourceBucket = bucket;
    }
    String localSchemaFile = Utils.TEMP_FOLDER + schemaFile;
    // download the file from the cloud storage
    this.getCloudService().downloadObjectFromCloudStorage(schemaFile, localSchemaFile, sourceBucket);
    // uncompress if compressed
    if (GzipUtils.isCompressedFilename(schemaFile)) {
        localSchemaFile = GzipUtils.getUncompressedFilename(localSchemaFile);
    }
    Set<String> terms = TermUtils.getRelevantSchemaTerms(localSchemaFile, TermUtils.RDFS_TBOX);
    log.info("Total relevant schema terms: " + terms.size());
    String schemaTermsFile = Utils.TEMP_FOLDER + Constants.SCHEMA_TERMS_JSON;
    // save to file
    FileUtils.objectToJsonFile(schemaTermsFile, terms);
    // upload the file to cloud storage
    this.getCloudService().uploadFileToCloudStorage(schemaTermsFile, bucket);
    // add value to the output
    this.addOutput("schemaTermsFile", Constants.SCHEMA_TERMS_JSON);
    log.info("Successfully processed schema terms");
}
From source file:net.sf.util.zip.FileNameUtil.java
/**
 * @param fileName the file name
 * @return a two-element array: the compressed file type, as defined in
 *         CompressorStreamFactory, and the corresponding uncompressed file name
 *         (both null if the suffix is not recognized)
 */
public static String[] getCompressFileType(String fileName) {
    String s = fileName.toLowerCase();
    String[] ret = { null, null };
    if (GzipUtils.isCompressedFilename(s)) {
        ret[0] = CompressorStreamFactory.GZIP;
        ret[1] = GzipUtils.getUncompressedFilename(fileName);
    } else if (BZip2Utils.isCompressedFilename(s)) {
        ret[0] = CompressorStreamFactory.BZIP2;
        ret[1] = BZip2Utils.getUncompressedFilename(fileName);
    } else if (XZUtils.isCompressedFilename(s)) {
        ret[0] = CompressorStreamFactory.XZ;
        ret[1] = XZUtils.getUncompressedFilename(fileName);
    }
    return ret;
}
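A hedged sketch of how the returned pair might be wired into CompressorStreamFactory to open the stream (the input path and class name are invented for illustration, and error handling is omitted):

import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.InputStream;
import org.apache.commons.compress.compressors.CompressorStreamFactory;
import net.sf.util.zip.FileNameUtil;

public class CompressTypeDemo {
    public static void main(String[] args) throws Exception {
        String fileName = "logs/data.csv.xz"; // hypothetical input
        String[] type = FileNameUtil.getCompressFileType(fileName);
        if (type[0] != null) {
            // type[0] is a CompressorStreamFactory constant, type[1] the uncompressed name
            try (InputStream in = new CompressorStreamFactory().createCompressorInputStream(
                    type[0], new BufferedInputStream(new FileInputStream(fileName)))) {
                // read decompressed bytes from 'in', e.g. copy them to a file named type[1]
            }
        }
    }
}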
From source file:io.ecarf.core.cloud.task.processor.reason.phase2.DoReasonTask9.java
/**
 * Carry out the setup of the schema terms
 * @param cloud the cloud service
 * @throws IOException
 */
private void setup(GoogleCloudService cloud) throws IOException {
    Set<String> termsSet;
    if (terms == null) {
        // too large, probably saved as a file
        log.info("Using json file for terms: " + termsFile);
        Validate.notNull(termsFile);
        String localTermsFile = Utils.TEMP_FOLDER + termsFile;
        cloud.downloadObjectFromCloudStorage(termsFile, localTermsFile, bucket);
        // convert from JSON
        termsSet = io.cloudex.framework.utils.FileUtils.jsonFileToSet(localTermsFile);
    } else {
        termsSet = ObjectUtils.csvToSet(terms);
    }
    String localSchemaFile = Utils.TEMP_FOLDER + schemaFile;
    // download the file from the cloud storage
    cloud.downloadObjectFromCloudStorage(schemaFile, localSchemaFile, bucket);
    // uncompress if compressed
    if (GzipUtils.isCompressedFilename(schemaFile)) {
        localSchemaFile = GzipUtils.getUncompressedFilename(localSchemaFile);
    }
    Map<Long, Set<Triple>> allSchemaTriples = TripleUtils.getRelevantSchemaETriples(localSchemaFile,
            TermUtils.RDFS_TBOX);
    // get all the triples we care about
    schemaTerms = new HashMap<>();
    for (String termStr : termsSet) {
        Long term = Long.parseLong(termStr);
        if (allSchemaTriples.containsKey(term)) {
            schemaTerms.put(term, allSchemaTriples.get(term));
        }
    }
}
From source file:io.ecarf.core.cloud.task.processor.reason.phase0.DoReasonTask3.java
@Override
public void run() throws IOException {
    GoogleCloudService cloud = (GoogleCloudService) this.getCloudService();
    //String table = metadata.getValue(EcarfMetaData.ECARF_TABLE);
    //Set<String> terms = metadata.getTerms();
    //String schemaFile = metadata.getValue(EcarfMetaData.ECARF_SCHEMA);
    //String bucket = metadata.getBucket();
    Set<String> termsSet;
    if (terms == null) {
        // too large, probably saved as a file
        //String termsFile = metadata.getValue(EcarfMetaData.ECARF_TERMS_FILE);
        log.info("Using json file for terms: " + termsFile);
        Validate.notNull(termsFile);
        String localTermsFile = Utils.TEMP_FOLDER + termsFile;
        cloud.downloadObjectFromCloudStorage(termsFile, localTermsFile, bucket);
        // convert from JSON
        termsSet = FileUtils.jsonFileToSet(localTermsFile);
    } else {
        termsSet = ObjectUtils.csvToSet(terms);
    }
    String localSchemaFile = Utils.TEMP_FOLDER + schemaFile;
    // download the file from the cloud storage
    cloud.downloadObjectFromCloudStorage(schemaFile, localSchemaFile, bucket);
    // uncompress if compressed
    if (GzipUtils.isCompressedFilename(schemaFile)) {
        localSchemaFile = GzipUtils.getUncompressedFilename(localSchemaFile);
    }
    Map<String, Set<Triple>> allSchemaTriples = TripleUtils.getRelevantSchemaNTriples(localSchemaFile,
            TermUtils.RDFS_TBOX);
    // get all the triples we care about
    Map<Term, Set<Triple>> schemaTerms = new HashMap<>();
    for (String term : termsSet) {
        if (allSchemaTriples.containsKey(term)) {
            schemaTerms.put(new Term(term), allSchemaTriples.get(term));
        }
    }
    String decoratedTable = table;
    int emptyRetries = 0;
    int totalInferredTriples = 0;
    int maxRetries = Config.getIntegerProperty(Constants.REASON_RETRY_KEY, 6);
    // timestamp loop
    do {
        //List<String> inferredFiles = new ArrayList<>();
        // First of all run all the queries asynchronously and remember the jobId and filename for each term
        for (Entry<Term, Set<Triple>> entry : schemaTerms.entrySet()) {
            Term term = entry.getKey();
            // add table decoration to table name
            String query = GenericRule.getQuery(entry.getValue(), decoratedTable);
            log.info("\nQuery: " + query);
            String jobId = cloud.startBigDataQuery(query);
            String encodedTerm = FileUtils.encodeFilename(term.getTerm());
            String filename = Utils.TEMP_FOLDER + encodedTerm + Constants.DOT_TERMS;
            // remember the filename and the jobId for this query
            term.setFilename(filename).setJobId(jobId).setEncodedTerm(encodedTerm);
        }
        long start = System.currentTimeMillis();
        String inferredTriplesFile = Utils.TEMP_FOLDER + start + Constants.DOT_INF;
        List<String> productiveTerms = new ArrayList<>();
        int interimInferredTriples = 0;
        try (PrintWriter writer = new PrintWriter(
                new GZIPOutputStream(new FileOutputStream(inferredTriplesFile), Constants.GZIP_BUF_SIZE))) {
            // now loop through the queries
            for (Entry<Term, Set<Triple>> entry : schemaTerms.entrySet()) {
                Term term = entry.getKey();
                log.info("Reasoning for Term: " + term);
                Set<Triple> schemaTriples = entry.getValue();
                log.info("Schema Triples: " + Joiner.on('\n').join(schemaTriples));
                List<String> select = GenericRule.getSelect(schemaTriples);
                // block and wait for each job to complete then save results to a file
                BigInteger rows = BigInteger.ZERO;
                try {
                    rows = cloud.saveBigQueryResultsToFile(term.getJobId(), term.getFilename()).getTotalRows();
                } catch (IOException ioe) {
                    // transient backend errors
                    log.warn("failed to save query results to file, jobId: " + term.getJobId());
                }
                log.info("Query found " + rows + ", rows");
                // only process if triples are found matching this term
                if (!BigInteger.ZERO.equals(rows)) {
                    int inferredTriplesCount = this.inferAndSaveTriplesToFile(term, select, schemaTriples, rows,
                            decoratedTable, writer);
                    productiveTerms.add(term.getTerm());
                    interimInferredTriples += inferredTriplesCount;
                }
            }
        }
        totalInferredTriples += interimInferredTriples;
        if (interimInferredTriples > 0) {
            //TODO stream smaller numbers of inferred triples
            //TODO try uploading from cloud storage
            int streamingThreshold = Config.getIntegerProperty("ecarf.io.reasoning.streaming.threshold", 100000);
            log.info("Inserting " + interimInferredTriples + ", inferred triples into Big Data table for "
                    + productiveTerms.size() + " productive terms. Filename: " + inferredTriplesFile);
            if (interimInferredTriples <= streamingThreshold) {
                // stream the data
                Set<Triple> inferredTriples = TripleUtils.loadCompressedCSVTriples(inferredTriplesFile, false);
                log.info("Total triples to stream into Big Data: " + inferredTriples.size());
                cloud.streamObjectsIntoBigData(inferredTriples, TableUtils.getBigQueryTripleTable(table));
                log.info("All inferred triples are streamed into Big Data table");
            } else {
                // directly upload the data
                List<String> jobIds = cloud.loadLocalFilesIntoBigData(Lists.newArrayList(inferredTriplesFile),
                        TableUtils.getBigQueryTripleTable(table), false);
                log.info("All inferred triples are directly loaded into Big Data table, completed jobIds: "
                        + jobIds);
            }
            // reset empty retries
            emptyRetries = 0;
        } else {
            log.info("No new inferred triples");
            // increment empty retries
            emptyRetries++;
        }
        log.info("Total inferred triples so far = " + totalInferredTriples + ", current retry count: "
                + emptyRetries);
        ApiUtils.block(Config.getIntegerProperty(Constants.REASON_SLEEP_KEY, 20));
        // FIXME move into the particular cloud implementation service
        long elapsed = System.currentTimeMillis() - start;
        decoratedTable = "[" + table + "@-" + elapsed + "-]";
        log.info("Using table decorator: " + decoratedTable + ". Empty retries count: " + emptyRetries);
    } while (!(emptyRetries == maxRetries)); // end timestamp loop
    log.info("Finished reasoning, total inferred triples = " + totalInferredTriples);
}
From source file:io.ecarf.core.cloud.task.processor.reason.phase0.DoReasonTask4.java
@Override
public void run() throws IOException {
    GoogleCloudService cloud = (GoogleCloudService) this.getCloudService();
    //String table = metadata.getValue(EcarfMetaData.ECARF_TABLE);
    //Set<String> terms = metadata.getTerms();
    //String schemaFile = metadata.getValue(EcarfMetaData.ECARF_SCHEMA);
    //String bucket = metadata.getBucket();
    Set<String> termsSet;
    if (terms == null) {
        // too large, probably saved as a file
        //String termsFile = metadata.getValue(EcarfMetaData.ECARF_TERMS_FILE);
        log.info("Using json file for terms: " + termsFile);
        Validate.notNull(termsFile);
        String localTermsFile = Utils.TEMP_FOLDER + termsFile;
        cloud.downloadObjectFromCloudStorage(termsFile, localTermsFile, bucket);
        // convert from JSON
        termsSet = FileUtils.jsonFileToSet(localTermsFile);
    } else {
        termsSet = ObjectUtils.csvToSet(terms);
    }
    String localSchemaFile = Utils.TEMP_FOLDER + schemaFile;
    // download the file from the cloud storage
    cloud.downloadObjectFromCloudStorage(schemaFile, localSchemaFile, bucket);
    // uncompress if compressed
    if (GzipUtils.isCompressedFilename(schemaFile)) {
        localSchemaFile = GzipUtils.getUncompressedFilename(localSchemaFile);
    }
    Map<String, Set<Triple>> allSchemaTriples = TripleUtils.getRelevantSchemaNTriples(localSchemaFile,
            TermUtils.RDFS_TBOX);
    // get all the triples we care about
    Map<Term, Set<Triple>> schemaTerms = new HashMap<>();
    for (String term : termsSet) {
        if (allSchemaTriples.containsKey(term)) {
            schemaTerms.put(new Term(term), allSchemaTriples.get(term));
        }
    }
    String decoratedTable = table;
    int emptyRetries = 0;
    int totalInferredTriples = 0;
    int maxRetries = Config.getIntegerProperty(Constants.REASON_RETRY_KEY, 6);
    String instanceId = cloud.getInstanceId();
    // timestamp loop
    do {
        //List<String> inferredFiles = new ArrayList<>();
        // First of all run all the queries asynchronously and remember the jobId and filename for each term
        for (Entry<Term, Set<Triple>> entry : schemaTerms.entrySet()) {
            Term term = entry.getKey();
            // add table decoration to table name
            String query = GenericRule.getQuery(entry.getValue(), decoratedTable);
            log.info("\nQuery: " + query);
            String jobId = cloud.startBigDataQuery(query);
            String encodedTerm = FileUtils.encodeFilename(term.getTerm());
            String filename = Utils.TEMP_FOLDER + encodedTerm + Constants.DOT_TERMS;
            // remember the filename and the jobId for this query
            term.setFilename(filename).setJobId(jobId).setEncodedTerm(encodedTerm);
        }
        long start = System.currentTimeMillis();
        String inferredTriplesFile = Utils.TEMP_FOLDER + instanceId + '_' + start + Constants.DOT_INF;
        List<String> productiveTerms = new ArrayList<>();
        int interimInferredTriples = 0;
        try (PrintWriter writer = new PrintWriter(
                new GZIPOutputStream(new FileOutputStream(inferredTriplesFile), Constants.GZIP_BUF_SIZE))) {
            // now loop through the queries
            for (Entry<Term, Set<Triple>> entry : schemaTerms.entrySet()) {
                Term term = entry.getKey();
                log.info("Reasoning for Term: " + term);
                Set<Triple> schemaTriples = entry.getValue();
                log.info("Schema Triples: " + Joiner.on('\n').join(schemaTriples));
                List<String> select = GenericRule.getSelect(schemaTriples);
                // block and wait for each job to complete then save results to a file
                BigInteger rows = BigInteger.ZERO;
                try {
                    rows = cloud.saveBigQueryResultsToFile(term.getJobId(), term.getFilename()).getTotalRows();
                } catch (IOException ioe) {
                    // transient backend errors
                    log.warn("failed to save query results to file, jobId: " + term.getJobId());
                }
                log.info("Query found " + rows + ", rows");
                // only process if triples are found matching this term
                if (!BigInteger.ZERO.equals(rows)) {
                    int inferredTriplesCount = this.inferAndSaveTriplesToFile(term, select, schemaTriples, rows,
                            decoratedTable, writer);
                    productiveTerms.add(term.getTerm());
                    interimInferredTriples += inferredTriplesCount;
                    this.totalRows = this.totalRows.add(rows);
                }
            }
        }
        totalInferredTriples += interimInferredTriples;
        if (interimInferredTriples > 0) {
            //TODO stream smaller numbers of inferred triples
            //TODO try uploading from cloud storage
            int streamingThreshold = Config.getIntegerProperty("ecarf.io.reasoning.streaming.threshold", 100000);
            log.info("Inserting " + interimInferredTriples + ", inferred triples into Big Data table for "
                    + productiveTerms.size() + " productive terms. Filename: " + inferredTriplesFile);
            if (interimInferredTriples <= streamingThreshold) {
                // stream the data
                Set<Triple> inferredTriples = TripleUtils.loadCompressedCSVTriples(inferredTriplesFile, false);
                log.info("Total triples to stream into Big Data: " + inferredTriples.size());
                cloud.streamObjectsIntoBigData(inferredTriples, TableUtils.getBigQueryTripleTable(table));
                log.info("All inferred triples are streamed into Big Data table");
            } else {
                // load the data through cloud storage
                // upload the file to cloud storage
                log.info("Uploading inferred triples file into cloud storage: " + inferredTriplesFile);
                StorageObject file = cloud.uploadFileToCloudStorage(inferredTriplesFile, bucket);
                log.info("File " + file + ", uploaded successfully. Now loading it into big data.");
                String jobId = cloud.loadCloudStorageFilesIntoBigData(Lists.newArrayList(file.getUri()),
                        TableUtils.getBigQueryTripleTable(table), false);
                log.info("All inferred triples are loaded into Big Data table through cloud storage, completed jobId: "
                        + jobId);
            }
            // reset empty retries
            emptyRetries = 0;
        } else {
            log.info("No new inferred triples");
            // increment empty retries
            emptyRetries++;
        }
        log.info("Total inferred triples so far = " + totalInferredTriples + ", current retry count: "
                + emptyRetries);
        ApiUtils.block(Config.getIntegerProperty(Constants.REASON_SLEEP_KEY, 20));
        // FIXME move into the particular cloud implementation service
        long elapsed = System.currentTimeMillis() - start;
        decoratedTable = "[" + table + "@-" + elapsed + "-]";
        log.info("Using table decorator: " + decoratedTable + ". Empty retries count: " + emptyRetries);
    } while (!(emptyRetries == maxRetries)); // end timestamp loop
    log.info("Finished reasoning, total inferred triples = " + totalInferredTriples);
    log.info("Number of avoided duplicate terms = " + this.duplicates);
    log.info("Total rows retrieved from big data = " + this.totalRows);
}
From source file:io.ecarf.core.cloud.task.processor.reason.phase2.DoReasonTask7.java
@Override
public void run() throws IOException {
    GoogleCloudService cloud = (GoogleCloudService) this.getCloudService();
    Stopwatch stopwatch1 = Stopwatch.createUnstarted();
    Stopwatch stopwatch2 = Stopwatch.createUnstarted();
    Set<String> termsSet;
    if (terms == null) {
        // too large, probably saved as a file
        log.info("Using json file for terms: " + termsFile);
        Validate.notNull(termsFile);
        String localTermsFile = Utils.TEMP_FOLDER + termsFile;
        cloud.downloadObjectFromCloudStorage(termsFile, localTermsFile, bucket);
        // convert from JSON
        termsSet = io.cloudex.framework.utils.FileUtils.jsonFileToSet(localTermsFile);
    } else {
        termsSet = ObjectUtils.csvToSet(terms);
    }
    String localSchemaFile = Utils.TEMP_FOLDER + schemaFile;
    // download the file from the cloud storage
    cloud.downloadObjectFromCloudStorage(schemaFile, localSchemaFile, bucket);
    // uncompress if compressed
    if (GzipUtils.isCompressedFilename(schemaFile)) {
        localSchemaFile = GzipUtils.getUncompressedFilename(localSchemaFile);
    }
    Map<Long, Set<Triple>> allSchemaTriples = TripleUtils.getRelevantSchemaETriples(localSchemaFile,
            TermUtils.RDFS_TBOX);
    // get all the triples we care about
    schemaTerms = new HashMap<>();
    for (String termStr : termsSet) {
        Long term = Long.parseLong(termStr);
        if (allSchemaTriples.containsKey(term)) {
            schemaTerms.put(term, allSchemaTriples.get(term));
        }
    }
    String decoratedTable = table;
    int emptyRetries = 0;
    int totalInferredTriples = 0;
    int maxRetries = Config.getIntegerProperty(Constants.REASON_RETRY_KEY, 6);
    String instanceId = cloud.getInstanceId();
    QueryGenerator<Long> generator = new QueryGenerator<Long>(schemaTerms, null);
    // timestamp loop
    do {
        Set<Long> productiveTerms = new HashSet<>();
        int interimInferredTriples = 0;
        // First of all run all the queries asynchronously and remember the jobId and filename for each term
        List<QueryResult> queryResults = new ArrayList<QueryResult>();
        generator.setDecoratedTable(decoratedTable);
        List<String> queries = generator.getQueries();
        log.debug("Generated Queries: " + queries);
        String queryResultFilePrefix = Utils.TEMP_FOLDER + instanceId + '_' + System.currentTimeMillis()
                + "_QueryResults_";
        int fileCount = 0;
        for (String query : queries) {
            String jobId = cloud.startBigDataQuery(query);
            queryResults.add(QueryResult.create().setFilename(queryResultFilePrefix + fileCount).setJobId(jobId));
            fileCount++;
        }
        // invoke all the queries in parallel
        //this.invokeAll(queryTasks);
        long start = System.currentTimeMillis();
        String inferredTriplesFile = Utils.TEMP_FOLDER + instanceId + '_' + start + Constants.DOT_INF;
        // save all the query results in files in parallel
        //this.invokeAll(saveTasks);
        for (QueryResult queryResult : queryResults) {
            try {
                // block and wait for each job to complete then save results to a file
                QueryStats stats = cloud.saveBigQueryResultsToFile(queryResult.getJobId(),
                        queryResult.getFilename());
                queryResult.setStats(stats);
            } catch (IOException ioe) {
                // transient backend errors
                log.warn("failed to save query results to file, jobId: " + queryResult.getJobId(), ioe);
                //TODO should throw an exception
            }
        }
        try (PrintWriter writer = new PrintWriter(
                new GZIPOutputStream(new FileOutputStream(inferredTriplesFile), Constants.GZIP_BUF_SIZE))) {
            // now loop through the queries
            //for (Entry<Term, Set<Triple>> entry : schemaTerms.entrySet()) {
            for (QueryResult queryResult : queryResults) {
                //Term term = entry.getKey();
                QueryStats stats = queryResult.getStats();
                BigInteger rows = stats.getTotalRows(); //term.getRows();
                this.totalBytes = this.totalBytes + stats.getTotalProcessedBytes(); //term.getBytes();
                // only process if triples are found matching this term
                if (!BigInteger.ZERO.equals(rows)) {
                    stopwatch1.start();
                    int inferredTriplesCount = this.inferAndSaveTriplesToFile(queryResult, productiveTerms,
                            decoratedTable, writer);
                    interimInferredTriples += inferredTriplesCount;
                    this.totalRows = this.totalRows.add(rows);
                    stopwatch1.stop();
                } else {
                    log.info("Skipping query as no data is found");
                }
            }
        }
        totalInferredTriples += interimInferredTriples;
        if (interimInferredTriples > 0) {
            // stream smaller numbers of inferred triples
            // try uploading from cloud storage
            int streamingThreshold = Config.getIntegerProperty("ecarf.io.reasoning.streaming.threshold", 100000);
            log.info("Inserting " + interimInferredTriples + ", inferred triples into Big Data table for "
                    + productiveTerms.size() + " productive terms. Filename: " + inferredTriplesFile);
            if (interimInferredTriples <= streamingThreshold) {
                // stream the data
                Set<Triple> inferredTriples = TripleUtils.loadCompressedCSVTriples(inferredTriplesFile, true);
                log.info("Total triples to stream into Big Data: " + inferredTriples.size());
                cloud.streamObjectsIntoBigData(inferredTriples, TableUtils.getBigQueryEncodedTripleTable(table));
                log.info("All inferred triples are streamed into Big Data table");
            } else {
                // load the data through cloud storage
                // upload the file to cloud storage
                log.info("Uploading inferred triples file into cloud storage: " + inferredTriplesFile);
                StorageObject file = cloud.uploadFileToCloudStorage(inferredTriplesFile, bucket);
                log.info("File " + file + ", uploaded successfully. Now loading it into big data.");
                String jobId = cloud.loadCloudStorageFilesIntoBigData(Lists.newArrayList(file.getUri()),
                        TableUtils.getBigQueryEncodedTripleTable(table), false);
                log.info("All inferred triples are loaded into Big Data table through cloud storage, completed jobId: "
                        + jobId);
            }
            // reset empty retries
            emptyRetries = 0;
            stopwatch2.reset();
        } else {
            log.info("No new inferred triples");
            // increment empty retries
            emptyRetries++;
            if (!stopwatch2.isRunning()) {
                stopwatch2.start();
            }
        }
        log.info("Total inferred triples so far = " + totalInferredTriples + ", current retry count: "
                + emptyRetries);
        if (emptyRetries < maxRetries) {
            ApiUtils.block(Config.getIntegerProperty(Constants.REASON_SLEEP_KEY, 20));
            // FIXME move into the particular cloud implementation service
            long elapsed = System.currentTimeMillis() - start;
            decoratedTable = "[" + table + "@-" + elapsed + "-]";
            log.info("Using table decorator: " + decoratedTable + ". Empty retries count: " + emptyRetries);
        }
    } while (emptyRetries < maxRetries); // end timestamp loop
    //executor.shutdown();
    log.info("Finished reasoning, total inferred triples = " + totalInferredTriples);
    //log.info("Number of avoided duplicate terms = " + this.duplicates);
    log.info("Total rows retrieved from big data = " + this.totalRows);
    log.info("Total processed GBytes = " + ((double) this.totalBytes / FileUtils.ONE_GB));
    log.info("Total process reasoning time (serialization in inf file) = " + stopwatch1);
    log.info("Total time spent in empty inference cycles = " + stopwatch2);
}
From source file:io.ecarf.core.cloud.task.processor.reason.phase2.DoReasonTask6.java
@Override
public void run() throws IOException {
    GoogleCloudService cloud = (GoogleCloudService) this.getCloudService();
    //String table = metadata.getValue(EcarfMetaData.ECARF_TABLE);
    //Set<String> terms = metadata.getTerms();
    //String schemaFile = metadata.getValue(EcarfMetaData.ECARF_SCHEMA);
    //String bucket = metadata.getBucket();
    Stopwatch stopwatch1 = Stopwatch.createUnstarted();
    Stopwatch stopwatch2 = Stopwatch.createUnstarted();
    Set<String> termsSet;
    if (terms == null) {
        // too large, probably saved as a file
        //String termsFile = metadata.getValue(EcarfMetaData.ECARF_TERMS_FILE);
        log.info("Using json file for terms: " + termsFile);
        Validate.notNull(termsFile);
        String localTermsFile = Utils.TEMP_FOLDER + termsFile;
        cloud.downloadObjectFromCloudStorage(termsFile, localTermsFile, bucket);
        // convert from JSON
        termsSet = io.cloudex.framework.utils.FileUtils.jsonFileToSet(localTermsFile);
    } else {
        termsSet = ObjectUtils.csvToSet(terms);
    }
    String localSchemaFile = Utils.TEMP_FOLDER + schemaFile;
    // download the file from the cloud storage
    cloud.downloadObjectFromCloudStorage(schemaFile, localSchemaFile, bucket);
    // uncompress if compressed
    if (GzipUtils.isCompressedFilename(schemaFile)) {
        localSchemaFile = GzipUtils.getUncompressedFilename(localSchemaFile);
    }
    Map<String, Set<Triple>> allSchemaTriples = TripleUtils.getRelevantSchemaNTriples(localSchemaFile,
            TermUtils.RDFS_TBOX);
    // get all the triples we care about
    schemaTerms = new HashMap<>();
    for (String term : termsSet) {
        if (allSchemaTriples.containsKey(term)) {
            schemaTerms.put(term, allSchemaTriples.get(term));
        }
    }
    String decoratedTable = table;
    int emptyRetries = 0;
    int totalInferredTriples = 0;
    int maxRetries = Config.getIntegerProperty(Constants.REASON_RETRY_KEY, 6);
    String instanceId = cloud.getInstanceId();
    QueryGenerator<String> generator = new QueryGenerator<String>(schemaTerms, null);
    // timestamp loop
    do {
        Set<String> productiveTerms = new HashSet<>();
        int interimInferredTriples = 0;
        // First of all run all the queries asynchronously and remember the jobId and filename for each term
        List<QueryResult> queryResults = new ArrayList<QueryResult>();
        generator.setDecoratedTable(decoratedTable);
        List<String> queries = generator.getQueries();
        log.debug("Generated Queries: " + queries);
        String queryResultFilePrefix = Utils.TEMP_FOLDER + instanceId + '_' + System.currentTimeMillis()
                + "_QueryResults_";
        int fileCount = 0;
        for (String query : queries) {
            String jobId = cloud.startBigDataQuery(query);
            queryResults.add(QueryResult.create().setFilename(queryResultFilePrefix + fileCount).setJobId(jobId));
            fileCount++;
        }
        // invoke all the queries in parallel
        //this.invokeAll(queryTasks);
        long start = System.currentTimeMillis();
        String inferredTriplesFile = Utils.TEMP_FOLDER + instanceId + '_' + start + Constants.DOT_INF;
        // save all the query results in files in parallel
        //this.invokeAll(saveTasks);
        for (QueryResult queryResult : queryResults) {
            try {
                // block and wait for each job to complete then save results to a file
                QueryStats stats = cloud.saveBigQueryResultsToFile(queryResult.getJobId(),
                        queryResult.getFilename());
                queryResult.setStats(stats);
            } catch (IOException ioe) {
                // transient backend errors
                log.warn("failed to save query results to file, jobId: " + queryResult.getJobId(), ioe);
                //TODO should throw an exception
            }
        }
        try (PrintWriter writer = new PrintWriter(
                new GZIPOutputStream(new FileOutputStream(inferredTriplesFile), Constants.GZIP_BUF_SIZE))) {
            // now loop through the queries
            //for (Entry<Term, Set<Triple>> entry : schemaTerms.entrySet()) {
            for (QueryResult queryResult : queryResults) {
                //Term term = entry.getKey();
                QueryStats stats = queryResult.getStats();
                BigInteger rows = stats.getTotalRows(); //term.getRows();
                this.totalBytes = this.totalBytes + stats.getTotalProcessedBytes(); //term.getBytes();
                // only process if triples are found matching this term
                if (!BigInteger.ZERO.equals(rows)) {
                    stopwatch1.start();
                    int inferredTriplesCount = this.inferAndSaveTriplesToFile(queryResult, productiveTerms,
                            decoratedTable, writer);
                    interimInferredTriples += inferredTriplesCount;
                    this.totalRows = this.totalRows.add(rows);
                    stopwatch1.stop();
                } else {
                    log.info("Skipping query as no data is found");
                }
            }
        }
        totalInferredTriples += interimInferredTriples;
        if (interimInferredTriples > 0) {
            // stream smaller numbers of inferred triples
            // try uploading from cloud storage
            int streamingThreshold = Config.getIntegerProperty("ecarf.io.reasoning.streaming.threshold", 100000);
            log.info("Inserting " + interimInferredTriples + ", inferred triples into Big Data table for "
                    + productiveTerms.size() + " productive terms. Filename: " + inferredTriplesFile);
            if (interimInferredTriples <= streamingThreshold) {
                // stream the data
                Set<Triple> inferredTriples = TripleUtils.loadCompressedCSVTriples(inferredTriplesFile, false);
                log.info("Total triples to stream into Big Data: " + inferredTriples.size());
                cloud.streamObjectsIntoBigData(inferredTriples, TableUtils.getBigQueryTripleTable(table));
                log.info("All inferred triples are streamed into Big Data table");
            } else {
                // load the data through cloud storage
                // upload the file to cloud storage
                log.info("Uploading inferred triples file into cloud storage: " + inferredTriplesFile);
                StorageObject file = cloud.uploadFileToCloudStorage(inferredTriplesFile, bucket);
                log.info("File " + file + ", uploaded successfully. Now loading it into big data.");
                String jobId = cloud.loadCloudStorageFilesIntoBigData(Lists.newArrayList(file.getUri()),
                        TableUtils.getBigQueryTripleTable(table), false);
                log.info("All inferred triples are loaded into Big Data table through cloud storage, completed jobId: "
                        + jobId);
            }
            // reset empty retries
            emptyRetries = 0;
            stopwatch2.reset();
        } else {
            log.info("No new inferred triples");
            // increment empty retries
            emptyRetries++;
            if (!stopwatch2.isRunning()) {
                stopwatch2.start();
            }
        }
        log.info("Total inferred triples so far = " + totalInferredTriples + ", current retry count: "
                + emptyRetries);
        if (emptyRetries < maxRetries) {
            ApiUtils.block(Config.getIntegerProperty(Constants.REASON_SLEEP_KEY, 20));
            // FIXME move into the particular cloud implementation service
            long elapsed = System.currentTimeMillis() - start;
            decoratedTable = "[" + table + "@-" + elapsed + "-]";
            log.info("Using table decorator: " + decoratedTable + ". Empty retries count: " + emptyRetries);
        }
    } while (emptyRetries < maxRetries); // end timestamp loop
    //executor.shutdown();
    log.info("Finished reasoning, total inferred triples = " + totalInferredTriples);
    log.info("Number of avoided duplicate terms = " + this.duplicates);
    log.info("Total rows retrieved from big data = " + this.totalRows);
    log.info("Total processed GBytes = " + ((double) this.totalBytes / FileUtils.ONE_GB));
    log.info("Total process reasoning time (serialization in inf file) = " + stopwatch1);
    log.info("Total time spent in empty inference cycles = " + stopwatch2);
}
From source file:io.ecarf.core.compress.NTripleGzipProcessor.java
/**
 * Get a decompressed stream from the provided input, based on the input file's name suffix
 * @param input the raw input stream
 * @return the decompressed stream, or the original stream if the name is not recognized
 * @throws IOException
 */
private InputStream getDeflatedStream(InputStream input) throws IOException {
    InputStream deflated = input;
    // gzip
    if (GzipUtils.isCompressedFilename(this.inputFile)) {
        deflated = new GZIPInputStream(input, Constants.GZIP_BUF_SIZE);
    }
    // bz2
    else if (BZip2Utils.isCompressedFilename(this.inputFile)) {
        deflated = new BZip2CompressorInputStream(new BufferedInputStream(input));
    }
    return deflated;
}
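A self-contained sketch of the same suffix-driven dispatch as a hypothetical standalone helper (the class name is invented, and commons-compress's GzipCompressorInputStream is used here in place of java.util.zip.GZIPInputStream):

import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import org.apache.commons.compress.compressors.bzip2.BZip2Utils;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.apache.commons.compress.compressors.gzip.GzipUtils;

public final class SuffixStreamOpener {
    /** Opens fileName, choosing a decompressor from the name suffix alone. */
    public static InputStream open(String fileName) throws IOException {
        InputStream in = new BufferedInputStream(new FileInputStream(fileName));
        if (GzipUtils.isCompressedFilename(fileName)) {
            return new GzipCompressorInputStream(in);
        }
        if (BZip2Utils.isCompressedFilename(fileName)) {
            return new BZip2CompressorInputStream(in);
        }
        return in; // no recognized compressed suffix: return the raw stream
    }
}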