Example usage for org.apache.commons.compress.compressors.gzip GzipUtils isCompressedFilename

List of usage examples for org.apache.commons.compress.compressors.gzip GzipUtils isCompressedFilename

Introduction

On this page you can find example usage for org.apache.commons.compress.compressors.gzip GzipUtils.isCompressedFilename.

Prototype

public static boolean isCompressedFilename(String filename) 

Document

Detects common gzip suffixes in the given filename.
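
The check is purely name-based; it does not inspect the file contents. Below is a minimal sketch of typical usage, assuming a hypothetical filename data.csv.gz; it pairs isCompressedFilename with getUncompressedFilename, which strips the detected gzip suffix.

import org.apache.commons.compress.compressors.gzip.GzipUtils;

public class GzipUtilsExample {
    public static void main(String[] args) {
        // hypothetical filename, used only for illustration
        String name = "data.csv.gz";

        // true for common gzip suffixes such as .gz, .tgz or .svgz
        if (GzipUtils.isCompressedFilename(name)) {
            // prints "data.csv" - the same name with the gzip suffix removed
            System.out.println(GzipUtils.getUncompressedFilename(name));
        }

        // a name without a gzip suffix is not reported as compressed
        System.out.println(GzipUtils.isCompressedFilename("data.csv")); // false
    }
}

As the examples below show, callers typically use this check to decide whether to wrap an input stream in a gzip decompressor or to derive the name of the uncompressed output file.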

Usage

From source file:com.zenome.bundlebus.Util.java

public static boolean unZip(@NonNull final File aGzFile, @NonNull final File aOutputFile)
        throws FileNotFoundException, IOException {
    Log.d(TAG, "gz filename : " + aGzFile);
    Log.d(TAG, "output file : " + aOutputFile);

    if (!GzipUtils.isCompressedFilename(aGzFile.getAbsolutePath())) {
        Log.d(TAG, "This file is not compressed file : " + aGzFile.getAbsolutePath());
        return false;
    }

    final FileInputStream fis = new FileInputStream(aGzFile);
    BufferedInputStream in = new BufferedInputStream(fis);
    FileOutputStream out = new FileOutputStream(aOutputFile);

    GzipCompressorInputStream gzIn = new GzipCompressorInputStream(in);
    final byte[] buffer = new byte[4096];
    int n = 0;

    while (-1 != (n = gzIn.read(buffer))) {
        out.write(buffer, 0, n);
    }

    out.close();
    gzIn.close();

    return true;
}

From source file:net.sf.util.zip.FileNameUtil.java

/**
 *
 * @param name the file name
 * @return whether the file is a valid compressed file or not (based on its suffix)
 */
public static boolean isCompressedFile(String name) {
    if (BZip2Utils.isCompressedFilename(name) || GzipUtils.isCompressedFilename(name)
            || XZUtils.isCompressedFilename(name))
        return true;
    return false;
}

From source file:io.ecarf.core.cloud.task.coordinator.CountSchemaTermTask.java

@Override
public void run() throws IOException {

    log.info("Processing schema terms");

    //String schemaFile = metadata.getValue(EcarfMetaData.ECARF_SCHEMA);
    //String bucket = metadata.getBucket();
    // check if we have a source bucket for backward compatibility
    if (StringUtils.isBlank(sourceBucket)) {
        log.warn("sourceBucket is empty, using bucket: " + bucket);
        this.sourceBucket = bucket;
    }

    String localSchemaFile = Utils.TEMP_FOLDER + schemaFile;
    // download the file from the cloud storage
    this.getCloudService().downloadObjectFromCloudStorage(schemaFile, localSchemaFile, sourceBucket);

    // uncompress if compressed
    if (GzipUtils.isCompressedFilename(schemaFile)) {
        localSchemaFile = GzipUtils.getUncompressedFilename(localSchemaFile);
    }

    Set<String> terms = TermUtils.getRelevantSchemaTerms(localSchemaFile, TermUtils.RDFS_TBOX);

    log.info("Total relevant schema terms: " + terms.size());

    String schemaTermsFile = Utils.TEMP_FOLDER + Constants.SCHEMA_TERMS_JSON;

    // save to file
    FileUtils.objectToJsonFile(schemaTermsFile, terms);

    // upload the file to cloud storage
    this.getCloudService().uploadFileToCloudStorage(schemaTermsFile, bucket);

    // add value to the output
    this.addOutput("schemaTermsFile", Constants.SCHEMA_TERMS_JSON);

    log.info("Successfully processed schema terms");
}

From source file:net.sf.util.zip.FileNameUtil.java

/**
 *
 * @param fileName  the file name
 * @return   Compressed file type, as defined in CompressorStreamFactory
 */
public static String[] getCompressFileType(String fileName) {
    String s = fileName.toLowerCase();
    String[] ret = { null, null };
    if (GzipUtils.isCompressedFilename(s)) {
        ret[0] = CompressorStreamFactory.GZIP;
        ret[1] = GzipUtils.getUncompressedFilename(fileName);
    } else if (BZip2Utils.isCompressedFilename(s)) {
        ret[0] = CompressorStreamFactory.BZIP2;
        ret[1] = BZip2Utils.getUncompressedFilename(fileName);
    } else if (XZUtils.isCompressedFilename(s)) {
        ret[0] = CompressorStreamFactory.XZ;
        ret[1] = XZUtils.getUncompressedFilename(fileName);
    }

    return ret;
}

From source file:io.ecarf.core.cloud.task.processor.reason.phase2.DoReasonTask9.java

/**
 * Carryout the setup of the schema terms
 * @param cloud
 * @throws IOException
 */
private void setup(GoogleCloudService cloud) throws IOException {

    Set<String> termsSet;

    if (terms == null) {
        // too large, probably saved as a file

        log.info("Using json file for terms: " + termsFile);
        Validate.notNull(termsFile);

        String localTermsFile = Utils.TEMP_FOLDER + termsFile;
        cloud.downloadObjectFromCloudStorage(termsFile, localTermsFile, bucket);

        // convert from JSON
        termsSet = io.cloudex.framework.utils.FileUtils.jsonFileToSet(localTermsFile);

    } else {
        termsSet = ObjectUtils.csvToSet(terms);
    }

    String localSchemaFile = Utils.TEMP_FOLDER + schemaFile;
    // download the file from the cloud storage
    cloud.downloadObjectFromCloudStorage(schemaFile, localSchemaFile, bucket);

    // uncompress if compressed
    if (GzipUtils.isCompressedFilename(schemaFile)) {
        localSchemaFile = GzipUtils.getUncompressedFilename(localSchemaFile);
    }

    Map<Long, Set<Triple>> allSchemaTriples = TripleUtils.getRelevantSchemaETriples(localSchemaFile,
            TermUtils.RDFS_TBOX);

    // get all the triples we care about
    schemaTerms = new HashMap<>();

    for (String termStr : termsSet) {

        Long term = Long.parseLong(termStr);

        if (allSchemaTriples.containsKey(term)) {
            schemaTerms.put(term, allSchemaTriples.get(term));
        }
    }

}

From source file:io.ecarf.core.cloud.task.processor.reason.phase0.DoReasonTask3.java

@Override
public void run() throws IOException {

    GoogleCloudService cloud = (GoogleCloudService) this.getCloudService();

    //String table = metadata.getValue(EcarfMetaData.ECARF_TABLE);
    //Set<String> terms = metadata.getTerms();
    //String schemaFile = metadata.getValue(EcarfMetaData.ECARF_SCHEMA);
    //String bucket = metadata.getBucket();
    Set<String> termsSet;

    if (terms == null) {
        // too large, probably saved as a file
        //String termsFile = metadata.getValue(EcarfMetaData.ECARF_TERMS_FILE);
        log.info("Using json file for terms: " + termsFile);
        Validate.notNull(termsFile);

        String localTermsFile = Utils.TEMP_FOLDER + termsFile;
        cloud.downloadObjectFromCloudStorage(termsFile, localTermsFile, bucket);

        // convert from JSON
        termsSet = FileUtils.jsonFileToSet(localTermsFile);

    } else {
        termsSet = ObjectUtils.csvToSet(terms);
    }

    String localSchemaFile = Utils.TEMP_FOLDER + schemaFile;
    // download the file from the cloud storage
    cloud.downloadObjectFromCloudStorage(schemaFile, localSchemaFile, bucket);

    // uncompress if compressed
    if (GzipUtils.isCompressedFilename(schemaFile)) {
        localSchemaFile = GzipUtils.getUncompressedFilename(localSchemaFile);
    }

    Map<String, Set<Triple>> allSchemaTriples = TripleUtils.getRelevantSchemaNTriples(localSchemaFile,
            TermUtils.RDFS_TBOX);

    // get all the triples we care about
    Map<Term, Set<Triple>> schemaTerms = new HashMap<>();

    for (String term : termsSet) {
        if (allSchemaTriples.containsKey(term)) {
            schemaTerms.put(new Term(term), allSchemaTriples.get(term));
        }
    }

    String decoratedTable = table;
    int emptyRetries = 0;
    int totalInferredTriples = 0;
    int maxRetries = Config.getIntegerProperty(Constants.REASON_RETRY_KEY, 6);

    // timestamp loop
    do {

        //List<String> inferredFiles = new ArrayList<>();

        // First of all run all the queries asynchronously and remember the jobId and filename for each term
        for (Entry<Term, Set<Triple>> entry : schemaTerms.entrySet()) {

            Term term = entry.getKey();

            // add table decoration to table name
            String query = GenericRule.getQuery(entry.getValue(), decoratedTable);

            log.info("\nQuery: " + query);

            String jobId = cloud.startBigDataQuery(query);
            String encodedTerm = FileUtils.encodeFilename(term.getTerm());
            String filename = Utils.TEMP_FOLDER + encodedTerm + Constants.DOT_TERMS;

            // remember the filename and the jobId for this query
            term.setFilename(filename).setJobId(jobId).setEncodedTerm(encodedTerm);

        }

        long start = System.currentTimeMillis();

        String inferredTriplesFile = Utils.TEMP_FOLDER + start + Constants.DOT_INF;

        List<String> productiveTerms = new ArrayList<>();

        int interimInferredTriples = 0;

        try (PrintWriter writer = new PrintWriter(
                new GZIPOutputStream(new FileOutputStream(inferredTriplesFile), Constants.GZIP_BUF_SIZE))) {

            // now loop through the queries
            for (Entry<Term, Set<Triple>> entry : schemaTerms.entrySet()) {

                Term term = entry.getKey();
                log.info("Reasoning for Term: " + term);

                Set<Triple> schemaTriples = entry.getValue();
                log.info("Schema Triples: " + Joiner.on('\n').join(schemaTriples));

                List<String> select = GenericRule.getSelect(schemaTriples);

                // block and wait for each job to complete then save results to a file
                BigInteger rows = BigInteger.ZERO;

                try {
                    rows = cloud.saveBigQueryResultsToFile(term.getJobId(), term.getFilename()).getTotalRows();

                } catch (IOException ioe) {
                    // transient backend errors
                    log.warn("failed to save query results to file, jobId: " + term.getJobId());
                }

                log.info("Query found " + rows + ", rows");

                // only process if triples are found matching this term
                if (!BigInteger.ZERO.equals(rows)) {

                    int inferredTriplesCount = this.inferAndSaveTriplesToFile(term, select, schemaTriples, rows,
                            decoratedTable, writer);

                    productiveTerms.add(term.getTerm());

                    interimInferredTriples += inferredTriplesCount;

                }
            }
        }

        totalInferredTriples += interimInferredTriples;

        if (interimInferredTriples > 0) {

            //TODO stream smaller numbers of inferred triples
            //TODO try uploading from cloud storage
            int streamingThreshold = Config.getIntegerProperty("ecarf.io.reasoning.streaming.threshold",
                    100000);

            log.info("Inserting " + interimInferredTriples + ", inferred triples into Big Data table for "
                    + productiveTerms.size() + " productive terms. Filename: " + inferredTriplesFile);

            if (interimInferredTriples <= streamingThreshold) {
                // stream the data

                Set<Triple> inferredTriples = TripleUtils.loadCompressedCSVTriples(inferredTriplesFile, false);
                log.info("Total triples to stream into Big Data: " + inferredTriples.size());
                cloud.streamObjectsIntoBigData(inferredTriples, TableUtils.getBigQueryTripleTable(table));

                log.info("All inferred triples are streamed into Big Data table");

            } else {

                // directly upload the data
                List<String> jobIds = cloud.loadLocalFilesIntoBigData(Lists.newArrayList(inferredTriplesFile),
                        TableUtils.getBigQueryTripleTable(table), false);
                log.info("All inferred triples are directly loaded into Big Data table, completed jobIds: "
                        + jobIds);
            }

            // reset empty retries
            emptyRetries = 0;

        } else {
            log.info("No new inferred triples");
            // increment empty retries
            emptyRetries++;
        }

        log.info("Total inferred triples so far = " + totalInferredTriples + ", current retry count: "
                + emptyRetries);

        ApiUtils.block(Config.getIntegerProperty(Constants.REASON_SLEEP_KEY, 20));

        // FIXME move into the particular cloud implementation service
        long elapsed = System.currentTimeMillis() - start;
        decoratedTable = "[" + table + "@-" + elapsed + "-]";

        log.info("Using table decorator: " + decoratedTable + ". Empty retries count: " + emptyRetries);

    } while (!(emptyRetries == maxRetries)); // end timestamp loop

    log.info("Finished reasoning, total inferred triples = " + totalInferredTriples);
}

From source file:io.ecarf.core.cloud.task.processor.reason.phase0.DoReasonTask4.java

@Override
public void run() throws IOException {

    GoogleCloudService cloud = (GoogleCloudService) this.getCloudService();

    //String table = metadata.getValue(EcarfMetaData.ECARF_TABLE);
    //Set<String> terms = metadata.getTerms();
    //String schemaFile = metadata.getValue(EcarfMetaData.ECARF_SCHEMA);
    //String bucket = metadata.getBucket();
    Set<String> termsSet;

    if (terms == null) {
        // too large, probably saved as a file
        //String termsFile = metadata.getValue(EcarfMetaData.ECARF_TERMS_FILE);
        log.info("Using json file for terms: " + termsFile);
        Validate.notNull(termsFile);

        String localTermsFile = Utils.TEMP_FOLDER + termsFile;
        cloud.downloadObjectFromCloudStorage(termsFile, localTermsFile, bucket);

        // convert from JSON
        termsSet = FileUtils.jsonFileToSet(localTermsFile);

    } else {
        termsSet = ObjectUtils.csvToSet(terms);
    }

    String localSchemaFile = Utils.TEMP_FOLDER + schemaFile;
    // download the file from the cloud storage
    cloud.downloadObjectFromCloudStorage(schemaFile, localSchemaFile, bucket);

    // uncompress if compressed
    if (GzipUtils.isCompressedFilename(schemaFile)) {
        localSchemaFile = GzipUtils.getUncompressedFilename(localSchemaFile);
    }

    Map<String, Set<Triple>> allSchemaTriples = TripleUtils.getRelevantSchemaNTriples(localSchemaFile,
            TermUtils.RDFS_TBOX);

    // get all the triples we care about
    Map<Term, Set<Triple>> schemaTerms = new HashMap<>();

    for (String term : termsSet) {
        if (allSchemaTriples.containsKey(term)) {
            schemaTerms.put(new Term(term), allSchemaTriples.get(term));
        }
    }

    String decoratedTable = table;
    int emptyRetries = 0;
    int totalInferredTriples = 0;
    int maxRetries = Config.getIntegerProperty(Constants.REASON_RETRY_KEY, 6);
    String instanceId = cloud.getInstanceId();

    // timestamp loop
    do {

        //List<String> inferredFiles = new ArrayList<>();

        // First of all run all the queries asynchronously and remember the jobId and filename for each term
        for (Entry<Term, Set<Triple>> entry : schemaTerms.entrySet()) {

            Term term = entry.getKey();

            // add table decoration to table name
            String query = GenericRule.getQuery(entry.getValue(), decoratedTable);

            log.info("\nQuery: " + query);

            String jobId = cloud.startBigDataQuery(query);
            String encodedTerm = FileUtils.encodeFilename(term.getTerm());
            String filename = Utils.TEMP_FOLDER + encodedTerm + Constants.DOT_TERMS;

            // remember the filename and the jobId for this query
            term.setFilename(filename).setJobId(jobId).setEncodedTerm(encodedTerm);

        }

        long start = System.currentTimeMillis();

        String inferredTriplesFile = Utils.TEMP_FOLDER + instanceId + '_' + start + Constants.DOT_INF;

        List<String> productiveTerms = new ArrayList<>();

        int interimInferredTriples = 0;

        try (PrintWriter writer = new PrintWriter(
                new GZIPOutputStream(new FileOutputStream(inferredTriplesFile), Constants.GZIP_BUF_SIZE))) {

            // now loop through the queries
            for (Entry<Term, Set<Triple>> entry : schemaTerms.entrySet()) {

                Term term = entry.getKey();
                log.info("Reasoning for Term: " + term);

                Set<Triple> schemaTriples = entry.getValue();
                log.info("Schema Triples: " + Joiner.on('\n').join(schemaTriples));

                List<String> select = GenericRule.getSelect(schemaTriples);

                // block and wait for each job to complete then save results to a file
                BigInteger rows = BigInteger.ZERO;

                try {
                    rows = cloud.saveBigQueryResultsToFile(term.getJobId(), term.getFilename()).getTotalRows();

                } catch (IOException ioe) {
                    // transient backend errors
                    log.warn("failed to save query results to file, jobId: " + term.getJobId());
                }

                log.info("Query found " + rows + ", rows");

                // only process if triples are found matching this term
                if (!BigInteger.ZERO.equals(rows)) {

                    int inferredTriplesCount = this.inferAndSaveTriplesToFile(term, select, schemaTriples, rows,
                            decoratedTable, writer);

                    productiveTerms.add(term.getTerm());

                    interimInferredTriples += inferredTriplesCount;

                    this.totalRows = this.totalRows.add(rows);

                }
            }
        }

        totalInferredTriples += interimInferredTriples;

        if (interimInferredTriples > 0) {

            //TODO stream smaller numbers of inferred triples
            //TODO try uploading from cloud storage
            int streamingThreshold = Config.getIntegerProperty("ecarf.io.reasoning.streaming.threshold",
                    100000);

            log.info("Inserting " + interimInferredTriples + ", inferred triples into Big Data table for "
                    + productiveTerms.size() + " productive terms. Filename: " + inferredTriplesFile);

            if (interimInferredTriples <= streamingThreshold) {
                // stream the data

                Set<Triple> inferredTriples = TripleUtils.loadCompressedCSVTriples(inferredTriplesFile, false);
                log.info("Total triples to stream into Big Data: " + inferredTriples.size());
                cloud.streamObjectsIntoBigData(inferredTriples, TableUtils.getBigQueryTripleTable(table));

                log.info("All inferred triples are streamed into Big Data table");

            } else {

                // load the data through cloud storage
                // upload the file to cloud storage
                log.info("Uploading inferred triples file into cloud storage: " + inferredTriplesFile);
                StorageObject file = cloud.uploadFileToCloudStorage(inferredTriplesFile, bucket);
                log.info("File " + file + ", uploaded successfully. Now loading it into big data.");

                String jobId = cloud.loadCloudStorageFilesIntoBigData(Lists.newArrayList(file.getUri()),
                        TableUtils.getBigQueryTripleTable(table), false);
                log.info(
                        "All inferred triples are loaded into Big Data table through cloud storage, completed jobId: "
                                + jobId);

            }

            // reset empty retries
            emptyRetries = 0;

        } else {
            log.info("No new inferred triples");
            // increment empty retries
            emptyRetries++;
        }

        log.info("Total inferred triples so far = " + totalInferredTriples + ", current retry count: "
                + emptyRetries);

        ApiUtils.block(Config.getIntegerProperty(Constants.REASON_SLEEP_KEY, 20));

        // FIXME move into the particular cloud implementation service
        long elapsed = System.currentTimeMillis() - start;
        decoratedTable = "[" + table + "@-" + elapsed + "-]";

        log.info("Using table decorator: " + decoratedTable + ". Empty retries count: " + emptyRetries);

    } while (!(emptyRetries == maxRetries)); // end timestamp loop

    log.info("Finished reasoning, total inferred triples = " + totalInferredTriples);
    log.info("Number of avoided duplicate terms = " + this.duplicates);
    log.info("Total rows retrieved from big data = " + this.totalRows);
}

From source file:io.ecarf.core.cloud.task.processor.reason.phase2.DoReasonTask7.java

@Override
public void run() throws IOException {

    GoogleCloudService cloud = (GoogleCloudService) this.getCloudService();

    Stopwatch stopwatch1 = Stopwatch.createUnstarted();
    Stopwatch stopwatch2 = Stopwatch.createUnstarted();
    Set<String> termsSet;

    if (terms == null) {
        // too large, probably saved as a file

        log.info("Using json file for terms: " + termsFile);
        Validate.notNull(termsFile);

        String localTermsFile = Utils.TEMP_FOLDER + termsFile;
        cloud.downloadObjectFromCloudStorage(termsFile, localTermsFile, bucket);

        // convert from JSON
        termsSet = io.cloudex.framework.utils.FileUtils.jsonFileToSet(localTermsFile);

    } else {
        termsSet = ObjectUtils.csvToSet(terms);
    }

    String localSchemaFile = Utils.TEMP_FOLDER + schemaFile;
    // download the file from the cloud storage
    cloud.downloadObjectFromCloudStorage(schemaFile, localSchemaFile, bucket);

    // uncompress if compressed
    if (GzipUtils.isCompressedFilename(schemaFile)) {
        localSchemaFile = GzipUtils.getUncompressedFilename(localSchemaFile);
    }

    Map<Long, Set<Triple>> allSchemaTriples = TripleUtils.getRelevantSchemaETriples(localSchemaFile,
            TermUtils.RDFS_TBOX);

    // get all the triples we care about
    schemaTerms = new HashMap<>();

    for (String termStr : termsSet) {

        Long term = Long.parseLong(termStr);

        if (allSchemaTriples.containsKey(term)) {
            schemaTerms.put(term, allSchemaTriples.get(term));
        }
    }

    String decoratedTable = table;
    int emptyRetries = 0;
    int totalInferredTriples = 0;
    int maxRetries = Config.getIntegerProperty(Constants.REASON_RETRY_KEY, 6);
    String instanceId = cloud.getInstanceId();

    QueryGenerator<Long> generator = new QueryGenerator<Long>(schemaTerms, null);

    // timestamp loop
    do {

        Set<Long> productiveTerms = new HashSet<>();
        int interimInferredTriples = 0;

        // First of all run all the queries asynchronously and remember the jobId and filename for each term

        List<QueryResult> queryResults = new ArrayList<QueryResult>();
        generator.setDecoratedTable(decoratedTable);

        List<String> queries = generator.getQueries();
        log.debug("Generated Queries: " + queries);
        String queryResultFilePrefix = Utils.TEMP_FOLDER + instanceId + '_' + System.currentTimeMillis()
                + "_QueryResults_";
        int fileCount = 0;
        for (String query : queries) {
            String jobId = cloud.startBigDataQuery(query);
            queryResults
                    .add(QueryResult.create().setFilename(queryResultFilePrefix + fileCount).setJobId(jobId));
            fileCount++;
        }

        // invoke all the queries in parallel
        //this.invokeAll(queryTasks);

        long start = System.currentTimeMillis();

        String inferredTriplesFile = Utils.TEMP_FOLDER + instanceId + '_' + start + Constants.DOT_INF;

        // save all the query results in files in parallel
        //this.invokeAll(saveTasks);

        for (QueryResult queryResult : queryResults) {
            try {
                // block and wait for each job to complete then save results to a file
                QueryStats stats = cloud.saveBigQueryResultsToFile(queryResult.getJobId(),
                        queryResult.getFilename());
                queryResult.setStats(stats);

            } catch (IOException ioe) {
                // transient backend errors
                log.warn("failed to save query results to file, jobId: " + queryResult.getJobId(), ioe);
                //TODO should throw an exception
            }
        }

        try (PrintWriter writer = new PrintWriter(
                new GZIPOutputStream(new FileOutputStream(inferredTriplesFile), Constants.GZIP_BUF_SIZE))) {

            // now loop through the queries
            //for(Entry<Term, Set<Triple>> entry: schemaTerms.entrySet()) {
            for (QueryResult queryResult : queryResults) {

                //Term term = entry.getKey();
                QueryStats stats = queryResult.getStats();

                BigInteger rows = stats.getTotalRows();//term.getRows();

                this.totalBytes = this.totalBytes + stats.getTotalProcessedBytes();//term.getBytes();

                // only process if triples are found matching this term
                if (!BigInteger.ZERO.equals(rows)) {

                    stopwatch1.start();

                    int inferredTriplesCount = this.inferAndSaveTriplesToFile(queryResult, productiveTerms,
                            decoratedTable, writer);

                    interimInferredTriples += inferredTriplesCount;

                    this.totalRows = this.totalRows.add(rows);

                    stopwatch1.stop();

                } else {
                    log.info("Skipping query as no data is found");
                }
            }
        }

        totalInferredTriples += interimInferredTriples;

        if (interimInferredTriples > 0) {

            // stream smaller numbers of inferred triples
            // try uploading from cloud storage
            int streamingThreshold = Config.getIntegerProperty("ecarf.io.reasoning.streaming.threshold",
                    100000);

            log.info("Inserting " + interimInferredTriples + ", inferred triples into Big Data table for "
                    + productiveTerms.size() + " productive terms. Filename: " + inferredTriplesFile);

            if (interimInferredTriples <= streamingThreshold) {
                // stream the data

                Set<Triple> inferredTriples = TripleUtils.loadCompressedCSVTriples(inferredTriplesFile, true);
                log.info("Total triples to stream into Big Data: " + inferredTriples.size());
                cloud.streamObjectsIntoBigData(inferredTriples,
                        TableUtils.getBigQueryEncodedTripleTable(table));

                log.info("All inferred triples are streamed into Big Data table");

            } else {

                // load the data through cloud storage
                // upload the file to cloud storage
                log.info("Uploading inferred triples file into cloud storage: " + inferredTriplesFile);
                StorageObject file = cloud.uploadFileToCloudStorage(inferredTriplesFile, bucket);
                log.info("File " + file + ", uploaded successfully. Now loading it into big data.");

                String jobId = cloud.loadCloudStorageFilesIntoBigData(Lists.newArrayList(file.getUri()),
                        TableUtils.getBigQueryEncodedTripleTable(table), false);
                log.info(
                        "All inferred triples are loaded into Big Data table through cloud storage, completed jobId: "
                                + jobId);

            }

            // reset empty retries
            emptyRetries = 0;

            stopwatch2.reset();

        } else {
            log.info("No new inferred triples");
            // increment empty retries
            emptyRetries++;

            if (!stopwatch2.isRunning()) {
                stopwatch2.start();
            }
        }

        log.info("Total inferred triples so far = " + totalInferredTriples + ", current retry count: "
                + emptyRetries);

        if (emptyRetries < maxRetries) {
            ApiUtils.block(Config.getIntegerProperty(Constants.REASON_SLEEP_KEY, 20));

            // FIXME move into the particular cloud implementation service
            long elapsed = System.currentTimeMillis() - start;
            decoratedTable = "[" + table + "@-" + elapsed + "-]";

            log.info("Using table decorator: " + decoratedTable + ". Empty retries count: " + emptyRetries);
        }

    } while (emptyRetries < maxRetries); // end timestamp loop

    //executor.shutdown();
    log.info("Finished reasoning, total inferred triples = " + totalInferredTriples);
    //log.info("Number of avoided duplicate terms = " + this.duplicates);
    log.info("Total rows retrieved from big data = " + this.totalRows);
    log.info("Total processed GBytes = " + ((double) this.totalBytes / FileUtils.ONE_GB));
    log.info("Total process reasoning time (serialization in inf file) = " + stopwatch1);
    log.info("Total time spent in empty inference cycles = " + stopwatch2);
}

From source file:io.ecarf.core.cloud.task.processor.reason.phase2.DoReasonTask6.java

@Override
public void run() throws IOException {

    GoogleCloudService cloud = (GoogleCloudService) this.getCloudService();

    //String table = metadata.getValue(EcarfMetaData.ECARF_TABLE);
    //Set<String> terms = metadata.getTerms();
    //String schemaFile = metadata.getValue(EcarfMetaData.ECARF_SCHEMA);
    //String bucket = metadata.getBucket();
    Stopwatch stopwatch1 = Stopwatch.createUnstarted();
    Stopwatch stopwatch2 = Stopwatch.createUnstarted();
    Set<String> termsSet;

    if (terms == null) {
        // too large, probably saved as a file
        //String termsFile = metadata.getValue(EcarfMetaData.ECARF_TERMS_FILE);
        log.info("Using json file for terms: " + termsFile);
        Validate.notNull(termsFile);

        String localTermsFile = Utils.TEMP_FOLDER + termsFile;
        cloud.downloadObjectFromCloudStorage(termsFile, localTermsFile, bucket);

        // convert from JSON
        termsSet = io.cloudex.framework.utils.FileUtils.jsonFileToSet(localTermsFile);

    } else {
        termsSet = ObjectUtils.csvToSet(terms);
    }

    String localSchemaFile = Utils.TEMP_FOLDER + schemaFile;
    // download the file from the cloud storage
    cloud.downloadObjectFromCloudStorage(schemaFile, localSchemaFile, bucket);

    // uncompress if compressed
    if (GzipUtils.isCompressedFilename(schemaFile)) {
        localSchemaFile = GzipUtils.getUncompressedFilename(localSchemaFile);
    }

    Map<String, Set<Triple>> allSchemaTriples = TripleUtils.getRelevantSchemaNTriples(localSchemaFile,
            TermUtils.RDFS_TBOX);

    // get all the triples we care about
    schemaTerms = new HashMap<>();

    for (String term : termsSet) {
        if (allSchemaTriples.containsKey(term)) {
            schemaTerms.put(term, allSchemaTriples.get(term));
        }
    }

    String decoratedTable = table;
    int emptyRetries = 0;
    int totalInferredTriples = 0;
    int maxRetries = Config.getIntegerProperty(Constants.REASON_RETRY_KEY, 6);
    String instanceId = cloud.getInstanceId();

    QueryGenerator<String> generator = new QueryGenerator<String>(schemaTerms, null);

    // timestamp loop
    do {

        Set<String> productiveTerms = new HashSet<>();
        int interimInferredTriples = 0;

        // First of all run all the queries asynchronously and remember the jobId and filename for each term

        List<QueryResult> queryResults = new ArrayList<QueryResult>();
        generator.setDecoratedTable(decoratedTable);

        List<String> queries = generator.getQueries();
        log.debug("Generated Queries: " + queries);
        String queryResultFilePrefix = Utils.TEMP_FOLDER + instanceId + '_' + System.currentTimeMillis()
                + "_QueryResults_";
        int fileCount = 0;
        for (String query : queries) {
            String jobId = cloud.startBigDataQuery(query);
            queryResults
                    .add(QueryResult.create().setFilename(queryResultFilePrefix + fileCount).setJobId(jobId));
            fileCount++;
        }

        // invoke all the queries in parallel
        //this.invokeAll(queryTasks);

        long start = System.currentTimeMillis();

        String inferredTriplesFile = Utils.TEMP_FOLDER + instanceId + '_' + start + Constants.DOT_INF;

        // save all the query results in files in parallel
        //this.invokeAll(saveTasks);

        for (QueryResult queryResult : queryResults) {
            try {
                // block and wait for each job to complete then save results to a file
                QueryStats stats = cloud.saveBigQueryResultsToFile(queryResult.getJobId(),
                        queryResult.getFilename());
                queryResult.setStats(stats);

            } catch (IOException ioe) {
                // transient backend errors
                log.warn("failed to save query results to file, jobId: " + queryResult.getJobId(), ioe);
                //TODO should throw an exception
            }
        }

        try (PrintWriter writer = new PrintWriter(
                new GZIPOutputStream(new FileOutputStream(inferredTriplesFile), Constants.GZIP_BUF_SIZE))) {

            // now loop through the queries
            //for(Entry<Term, Set<Triple>> entry: schemaTerms.entrySet()) {
            for (QueryResult queryResult : queryResults) {

                //Term term = entry.getKey();
                QueryStats stats = queryResult.getStats();

                BigInteger rows = stats.getTotalRows();//term.getRows();

                this.totalBytes = this.totalBytes + stats.getTotalProcessedBytes();//term.getBytes();

                // only process if triples are found matching this term
                if (!BigInteger.ZERO.equals(rows)) {

                    stopwatch1.start();

                    int inferredTriplesCount = this.inferAndSaveTriplesToFile(queryResult, productiveTerms,
                            decoratedTable, writer);

                    interimInferredTriples += inferredTriplesCount;

                    this.totalRows = this.totalRows.add(rows);

                    stopwatch1.stop();

                } else {
                    log.info("Skipping query as no data is found");
                }
            }
        }

        totalInferredTriples += interimInferredTriples;

        if (interimInferredTriples > 0) {

            // stream smaller numbers of inferred triples
            // try uploading from cloud storage
            int streamingThreshold = Config.getIntegerProperty("ecarf.io.reasoning.streaming.threshold",
                    100000);

            log.info("Inserting " + interimInferredTriples + ", inferred triples into Big Data table for "
                    + productiveTerms.size() + " productive terms. Filename: " + inferredTriplesFile);

            if (interimInferredTriples <= streamingThreshold) {
                // stream the data

                Set<Triple> inferredTriples = TripleUtils.loadCompressedCSVTriples(inferredTriplesFile, false);
                log.info("Total triples to stream into Big Data: " + inferredTriples.size());
                cloud.streamObjectsIntoBigData(inferredTriples, TableUtils.getBigQueryTripleTable(table));

                log.info("All inferred triples are streamed into Big Data table");

            } else {

                // load the data through cloud storage
                // upload the file to cloud storage
                log.info("Uploading inferred triples file into cloud storage: " + inferredTriplesFile);
                StorageObject file = cloud.uploadFileToCloudStorage(inferredTriplesFile, bucket);
                log.info("File " + file + ", uploaded successfully. Now loading it into big data.");

                String jobId = cloud.loadCloudStorageFilesIntoBigData(Lists.newArrayList(file.getUri()),
                        TableUtils.getBigQueryTripleTable(table), false);
                log.info(
                        "All inferred triples are loaded into Big Data table through cloud storage, completed jobId: "
                                + jobId);

            }

            // reset empty retries
            emptyRetries = 0;

            stopwatch2.reset();

        } else {
            log.info("No new inferred triples");
            // increment empty retries
            emptyRetries++;

            if (!stopwatch2.isRunning()) {
                stopwatch2.start();
            }
        }

        log.info("Total inferred triples so far = " + totalInferredTriples + ", current retry count: "
                + emptyRetries);

        if (emptyRetries < maxRetries) {
            ApiUtils.block(Config.getIntegerProperty(Constants.REASON_SLEEP_KEY, 20));

            // FIXME move into the particular cloud implementation service
            long elapsed = System.currentTimeMillis() - start;
            decoratedTable = "[" + table + "@-" + elapsed + "-]";

            log.info("Using table decorator: " + decoratedTable + ". Empty retries count: " + emptyRetries);
        }

    } while (emptyRetries < maxRetries); // end timestamp loop

    //executor.shutdown();
    log.info("Finished reasoning, total inferred triples = " + totalInferredTriples);
    log.info("Number of avoided duplicate terms = " + this.duplicates);
    log.info("Total rows retrieved from big data = " + this.totalRows);
    log.info("Total processed GBytes = " + ((double) this.totalBytes / FileUtils.ONE_GB));
    log.info("Total process reasoning time (serialization in inf file) = " + stopwatch1);
    log.info("Total time spent in empty inference cycles = " + stopwatch2);
}

From source file:io.ecarf.core.compress.NTripleGzipProcessor.java

/**
 * Get a deflated stream from the provided input
 * @param input
 * @return
 * @throws IOException
 */
private InputStream getDeflatedStream(InputStream input) throws IOException {

    InputStream deflated = input;

    // gzip
    if (GzipUtils.isCompressedFilename(this.inputFile)) {
        deflated = new GZIPInputStream(input, Constants.GZIP_BUF_SIZE);

    }
    // bz2
    else if (BZip2Utils.isCompressedFilename(this.inputFile)) {
        deflated = new BZip2CompressorInputStream(new BufferedInputStream(input));
    }

    return deflated;
}