Example usage for org.apache.commons.compress.compressors.gzip GzipUtils getUncompressedFilename

List of usage examples for org.apache.commons.compress.compressors.gzip GzipUtils getUncompressedFilename

Introduction

On this page you can find example usage of org.apache.commons.compress.compressors.gzip GzipUtils getUncompressedFilename.

Prototype

public static String getUncompressedFilename(String filename) 

Document

Maps the given name of a gzip-compressed file to the name that the file should have after uncompression.
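
For orientation, here is a minimal, self-contained sketch of that mapping (the file names are made up for illustration; the exact suffix table depends on the commons-compress version in use):

import org.apache.commons.compress.compressors.gzip.GzipUtils;

public class GzipFilenameDemo {

    public static void main(String[] args) {
        // Recognised gzip suffixes are stripped or rewritten
        System.out.println(GzipUtils.getUncompressedFilename("archive.tar.gz")); // archive.tar
        System.out.println(GzipUtils.getUncompressedFilename("data.txt.gz"));    // data.txt
        System.out.println(GzipUtils.getUncompressedFilename("logs.tgz"));       // logs.tar

        // A name without a recognised gzip suffix is returned unchanged
        System.out.println(GzipUtils.getUncompressedFilename("report.txt"));     // report.txt
    }
}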

Usage

From source file:com.dubture.symfony.core.util.UncompressUtils.java

/**
 * Uncompresses a gzip archive and returns the file into which it has been
 * extracted.
 *
 * @param archiveFile The archive file to uncompress
 * @param outputDirectory The output directory in which to place the uncompressed archive
 *
 * @return The output file where the archive has been uncompressed
 *
 * @throws IOException When a problem occurs with either the input or output stream
 */
public static File uncompressGzipArchive(File archiveFile, File outputDirectory) throws IOException {
    FileInputStream fileInputStream = new FileInputStream(archiveFile);
    BufferedInputStream bufferedInputStream = new BufferedInputStream(fileInputStream);
    GzipCompressorInputStream gzipInputStream = new GzipCompressorInputStream(bufferedInputStream);

    String tarArchiveFilename = GzipUtils.getUncompressedFilename(archiveFile.getName());
    File outputFile = new File(outputDirectory, tarArchiveFilename);
    FileOutputStream outputStream = new FileOutputStream(outputFile);

    int byteReadCount = 0;
    final byte[] data = new byte[BUFFER_SIZE];

    try {
        while ((byteReadCount = gzipInputStream.read(data, 0, BUFFER_SIZE)) != -1) {
            outputStream.write(data, 0, byteReadCount);
        }
    } finally {
        outputStream.close();
        gzipInputStream.close();
    }

    return outputFile;
}
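
A hypothetical caller of the helper above might look like the following; the paths are placeholders, not taken from the original project:

import java.io.File;
import java.io.IOException;

import com.dubture.symfony.core.util.UncompressUtils;

public class UncompressUtilsDemo {

    public static void main(String[] args) throws IOException {
        File archive = new File("/tmp/example.tar.gz"); // placeholder path
        File targetDir = new File("/tmp/extracted");    // placeholder path

        // GzipUtils.getUncompressedFilename() derives "example.tar" from
        // "example.tar.gz" for the name of the extracted file
        File tarFile = UncompressUtils.uncompressGzipArchive(archive, targetDir);
        System.out.println("Uncompressed to: " + tarFile.getAbsolutePath());
    }
}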

From source file:io.ecarf.core.cloud.task.coordinator.CountSchemaTermTask.java

@Override
public void run() throws IOException {

    log.info("Processing schema terms");

    //String schemaFile = metadata.getValue(EcarfMetaData.ECARF_SCHEMA);
    //String bucket = metadata.getBucket();
    // check if we have a source bucket for backward compatibility
    if (StringUtils.isBlank(sourceBucket)) {
        log.warn("sourceBucket is empty, using bucket: " + bucket);
        this.sourceBucket = bucket;
    }

    String localSchemaFile = Utils.TEMP_FOLDER + schemaFile;
    // download the file from the cloud storage
    this.getCloudService().downloadObjectFromCloudStorage(schemaFile, localSchemaFile, sourceBucket);

    // uncompress if compressed
    if (GzipUtils.isCompressedFilename(schemaFile)) {
        localSchemaFile = GzipUtils.getUncompressedFilename(localSchemaFile);
    }

    Set<String> terms = TermUtils.getRelevantSchemaTerms(localSchemaFile, TermUtils.RDFS_TBOX);

    log.info("Total relevant schema terms: " + terms.size());

    String schemaTermsFile = Utils.TEMP_FOLDER + Constants.SCHEMA_TERMS_JSON;

    // save to file
    FileUtils.objectToJsonFile(schemaTermsFile, terms);

    // upload the file to cloud storage
    this.getCloudService().uploadFileToCloudStorage(schemaTermsFile, bucket);

    // add value to the output
    this.addOutput("schemaTermsFile", Constants.SCHEMA_TERMS_JSON);

    log.info("Successfully processed schema terms");
}

From source file:net.sf.util.zip.FileNameUtil.java

/**
 *
 * @param fileName  the file name
 * @return   Compressed file type, as defined in CompressorStreamFactory
 */
public static String[] getCompressFileType(String fileName) {
    String s = fileName.toLowerCase();
    String[] ret = { null, null };
    if (GzipUtils.isCompressedFilename(s)) {
        ret[0] = CompressorStreamFactory.GZIP;
        ret[1] = GzipUtils.getUncompressedFilename(fileName);
    } else if (BZip2Utils.isCompressedFilename(s)) {
        ret[0] = CompressorStreamFactory.BZIP2;
        ret[1] = BZip2Utils.getUncompressedFilename(fileName);
    } else if (XZUtils.isCompressedFilename(s)) {
        ret[0] = CompressorStreamFactory.XZ;
        ret[1] = XZUtils.getUncompressedFilename(fileName);
    }

    return ret;
}
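
As a follow-up, this sketch (not part of the original project; the sample path is a placeholder) shows how the returned pair could be combined with CompressorStreamFactory to open the compressed stream and name the decompressed output:

import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.InputStream;

import org.apache.commons.compress.compressors.CompressorStreamFactory;

import net.sf.util.zip.FileNameUtil;

public class CompressFileTypeDemo {

    public static void main(String[] args) throws Exception {
        String fileName = "/tmp/data.csv.gz"; // placeholder path
        String[] type = FileNameUtil.getCompressFileType(fileName);

        if (type[0] != null) {
            // type[0] is the CompressorStreamFactory constant, type[1] the uncompressed name
            try (InputStream in = new CompressorStreamFactory().createCompressorInputStream(type[0],
                    new BufferedInputStream(new FileInputStream(fileName)))) {
                System.out.println("Reading " + type[1] + " from " + fileName);
                // ... consume the decompressed stream here ...
            }
        } else {
            System.out.println("Not a recognised compressed file: " + fileName);
        }
    }
}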

From source file:com.sangupta.jerry.util.ArchiveUtils.java

/**
 * Unpacks the given gzip (.gz) file into the given directory.
 * 
 * @param gzipFile
 *            the file that needs to be unpacked
 * 
 * @param outputDir
 *            the directory in which the archive is unpacked
 * 
 * @throws FileNotFoundException
 *             if the GZ file is not found
 * 
 * @return a list of {@link File} objects representing the files unpacked
 *         from the GZ file
 */
public static List<File> unpackGZIP(final File gzipFile, final File outputDir) throws IOException {
    String outputFileName = GzipUtils.getUncompressedFilename(gzipFile.getName());
    File outputFile = new File(outputDir, outputFileName);

    FileInputStream fin = null;
    BufferedInputStream in = null;
    FileOutputStream out = null;
    GzipCompressorInputStream gzIn = null;

    try {
        fin = new FileInputStream(gzipFile);
        in = new BufferedInputStream(fin);
        out = new FileOutputStream(outputFile);
        gzIn = new GzipCompressorInputStream(in);

        org.apache.commons.io.IOUtils.copy(gzIn, out);
    } finally {
        org.apache.commons.io.IOUtils.closeQuietly(gzIn);
        org.apache.commons.io.IOUtils.closeQuietly(out);
        org.apache.commons.io.IOUtils.closeQuietly(in);
        org.apache.commons.io.IOUtils.closeQuietly(fin);
    }

    List<File> files = new ArrayList<File>();
    files.add(outputFile);

    return files;
}

From source file:io.ecarf.core.cloud.task.processor.reason.phase2.DoReasonTask9.java

/**
 * Carries out the setup of the schema terms
 * @param cloud
 * @throws IOException
 */
private void setup(GoogleCloudService cloud) throws IOException {

    Set<String> termsSet;

    if (terms == null) {
        // too large, probably saved as a file

        log.info("Using json file for terms: " + termsFile);
        Validate.notNull(termsFile);

        String localTermsFile = Utils.TEMP_FOLDER + termsFile;
        cloud.downloadObjectFromCloudStorage(termsFile, localTermsFile, bucket);

        // convert from JSON
        termsSet = io.cloudex.framework.utils.FileUtils.jsonFileToSet(localTermsFile);

    } else {
        termsSet = ObjectUtils.csvToSet(terms);
    }

    String localSchemaFile = Utils.TEMP_FOLDER + schemaFile;
    // download the file from the cloud storage
    cloud.downloadObjectFromCloudStorage(schemaFile, localSchemaFile, bucket);

    // uncompress if compressed
    if (GzipUtils.isCompressedFilename(schemaFile)) {
        localSchemaFile = GzipUtils.getUncompressedFilename(localSchemaFile);
    }

    Map<Long, Set<Triple>> allSchemaTriples = TripleUtils.getRelevantSchemaETriples(localSchemaFile,
            TermUtils.RDFS_TBOX);

    // get all the triples we care about
    schemaTerms = new HashMap<>();

    for (String termStr : termsSet) {

        Long term = Long.parseLong(termStr);

        if (allSchemaTriples.containsKey(term)) {
            schemaTerms.put(term, allSchemaTriples.get(term));
        }
    }

}

From source file:io.ecarf.core.cloud.task.processor.reason.phase0.DoReasonTask3.java

@Override
public void run() throws IOException {

    GoogleCloudService cloud = (GoogleCloudService) this.getCloudService();

    //String table = metadata.getValue(EcarfMetaData.ECARF_TABLE);
    //Set<String> terms = metadata.getTerms();
    //String schemaFile = metadata.getValue(EcarfMetaData.ECARF_SCHEMA);
    //String bucket = metadata.getBucket();
    Set<String> termsSet;

    if (terms == null) {
        // too large, probably saved as a file
        //String termsFile = metadata.getValue(EcarfMetaData.ECARF_TERMS_FILE);
        log.info("Using json file for terms: " + termsFile);
        Validate.notNull(termsFile);

        String localTermsFile = Utils.TEMP_FOLDER + termsFile;
        cloud.downloadObjectFromCloudStorage(termsFile, localTermsFile, bucket);

        // convert from JSON
        termsSet = FileUtils.jsonFileToSet(localTermsFile);

    } else {
        termsSet = ObjectUtils.csvToSet(terms);
    }

    String localSchemaFile = Utils.TEMP_FOLDER + schemaFile;
    // download the file from the cloud storage
    cloud.downloadObjectFromCloudStorage(schemaFile, localSchemaFile, bucket);

    // uncompress if compressed
    if (GzipUtils.isCompressedFilename(schemaFile)) {
        localSchemaFile = GzipUtils.getUncompressedFilename(localSchemaFile);
    }

    Map<String, Set<Triple>> allSchemaTriples = TripleUtils.getRelevantSchemaNTriples(localSchemaFile,
            TermUtils.RDFS_TBOX);

    // get all the triples we care about
    Map<Term, Set<Triple>> schemaTerms = new HashMap<>();

    for (String term : termsSet) {
        if (allSchemaTriples.containsKey(term)) {
            schemaTerms.put(new Term(term), allSchemaTriples.get(term));
        }
    }

    String decoratedTable = table;
    int emptyRetries = 0;
    int totalInferredTriples = 0;
    int maxRetries = Config.getIntegerProperty(Constants.REASON_RETRY_KEY, 6);

    // timestamp loop
    do {

        //List<String> inferredFiles = new ArrayList<>();

        // First of all run all the queries asynchronously and remember the jobId and filename for each term
        for (Entry<Term, Set<Triple>> entry : schemaTerms.entrySet()) {

            Term term = entry.getKey();

            // add table decoration to table name
            String query = GenericRule.getQuery(entry.getValue(), decoratedTable);

            log.info("\nQuery: " + query);

            String jobId = cloud.startBigDataQuery(query);
            String encodedTerm = FileUtils.encodeFilename(term.getTerm());
            String filename = Utils.TEMP_FOLDER + encodedTerm + Constants.DOT_TERMS;

            // remember the filename and the jobId for this query
            term.setFilename(filename).setJobId(jobId).setEncodedTerm(encodedTerm);

        }

        long start = System.currentTimeMillis();

        String inferredTriplesFile = Utils.TEMP_FOLDER + start + Constants.DOT_INF;

        List<String> productiveTerms = new ArrayList<>();

        int interimInferredTriples = 0;

        try (PrintWriter writer = new PrintWriter(
                new GZIPOutputStream(new FileOutputStream(inferredTriplesFile), Constants.GZIP_BUF_SIZE))) {

            // now loop through the queries
            for (Entry<Term, Set<Triple>> entry : schemaTerms.entrySet()) {

                Term term = entry.getKey();
                log.info("Reasoning for Term: " + term);

                Set<Triple> schemaTriples = entry.getValue();
                log.info("Schema Triples: " + Joiner.on('\n').join(schemaTriples));

                List<String> select = GenericRule.getSelect(schemaTriples);

                // block and wait for each job to complete then save results to a file
                BigInteger rows = BigInteger.ZERO;

                try {
                    rows = cloud.saveBigQueryResultsToFile(term.getJobId(), term.getFilename()).getTotalRows();

                } catch (IOException ioe) {
                    // transient backend errors
                    log.warn("failed to save query results to file, jobId: " + term.getJobId());
                }

                log.info("Query found " + rows + ", rows");

                // only process if triples are found matching this term
                if (!BigInteger.ZERO.equals(rows)) {

                    int inferredTriplesCount = this.inferAndSaveTriplesToFile(term, select, schemaTriples, rows,
                            decoratedTable, writer);

                    productiveTerms.add(term.getTerm());

                    interimInferredTriples += inferredTriplesCount;

                }
            }
        }

        totalInferredTriples += interimInferredTriples;

        if (interimInferredTriples > 0) {

            //TODO stream smaller numbers of inferred triples
            //TODO try uploading from cloud storage
            int streamingThreshold = Config.getIntegerProperty("ecarf.io.reasoning.streaming.threshold",
                    100000);

            log.info("Inserting " + interimInferredTriples + ", inferred triples into Big Data table for "
                    + productiveTerms.size() + " productive terms. Filename: " + inferredTriplesFile);

            if (interimInferredTriples <= streamingThreshold) {
                // stream the data

                Set<Triple> inferredTriples = TripleUtils.loadCompressedCSVTriples(inferredTriplesFile, false);
                log.info("Total triples to stream into Big Data: " + inferredTriples.size());
                cloud.streamObjectsIntoBigData(inferredTriples, TableUtils.getBigQueryTripleTable(table));

                log.info("All inferred triples are streamed into Big Data table");

            } else {

                // directly upload the data
                List<String> jobIds = cloud.loadLocalFilesIntoBigData(Lists.newArrayList(inferredTriplesFile),
                        TableUtils.getBigQueryTripleTable(table), false);
                log.info("All inferred triples are directly loaded into Big Data table, completed jobIds: "
                        + jobIds);
            }

            // reset empty retries
            emptyRetries = 0;

        } else {
            log.info("No new inferred triples");
            // increment empty retries
            emptyRetries++;
        }

        log.info("Total inferred triples so far = " + totalInferredTriples + ", current retry count: "
                + emptyRetries);

        ApiUtils.block(Config.getIntegerProperty(Constants.REASON_SLEEP_KEY, 20));

        // FIXME move into the particular cloud implementation service
        long elapsed = System.currentTimeMillis() - start;
        decoratedTable = "[" + table + "@-" + elapsed + "-]";

        log.info("Using table decorator: " + decoratedTable + ". Empty retries count: " + emptyRetries);

    } while (!(emptyRetries == maxRetries)); // end timestamp loop

    log.info("Finished reasoning, total inferred triples = " + totalInferredTriples);
}

From source file:io.ecarf.core.cloud.task.processor.reason.phase0.DoReasonTask4.java

@Override
public void run() throws IOException {

    GoogleCloudService cloud = (GoogleCloudService) this.getCloudService();

    //String table = metadata.getValue(EcarfMetaData.ECARF_TABLE);
    //Set<String> terms = metadata.getTerms();
    //String schemaFile = metadata.getValue(EcarfMetaData.ECARF_SCHEMA);
    //String bucket = metadata.getBucket();
    Set<String> termsSet;

    if (terms == null) {
        // too large, probably saved as a file
        //String termsFile = metadata.getValue(EcarfMetaData.ECARF_TERMS_FILE);
        log.info("Using json file for terms: " + termsFile);
        Validate.notNull(termsFile);

        String localTermsFile = Utils.TEMP_FOLDER + termsFile;
        cloud.downloadObjectFromCloudStorage(termsFile, localTermsFile, bucket);

        // convert from JSON
        termsSet = FileUtils.jsonFileToSet(localTermsFile);

    } else {
        termsSet = ObjectUtils.csvToSet(terms);
    }

    String localSchemaFile = Utils.TEMP_FOLDER + schemaFile;
    // download the file from the cloud storage
    cloud.downloadObjectFromCloudStorage(schemaFile, localSchemaFile, bucket);

    // uncompress if compressed
    if (GzipUtils.isCompressedFilename(schemaFile)) {
        localSchemaFile = GzipUtils.getUncompressedFilename(localSchemaFile);
    }

    Map<String, Set<Triple>> allSchemaTriples = TripleUtils.getRelevantSchemaNTriples(localSchemaFile,
            TermUtils.RDFS_TBOX);

    // get all the triples we care about
    Map<Term, Set<Triple>> schemaTerms = new HashMap<>();

    for (String term : termsSet) {
        if (allSchemaTriples.containsKey(term)) {
            schemaTerms.put(new Term(term), allSchemaTriples.get(term));
        }
    }

    String decoratedTable = table;
    int emptyRetries = 0;
    int totalInferredTriples = 0;
    int maxRetries = Config.getIntegerProperty(Constants.REASON_RETRY_KEY, 6);
    String instanceId = cloud.getInstanceId();

    // timestamp loop
    do {

        //List<String> inferredFiles = new ArrayList<>();

        // First of all run all the queries asynchronously and remember the jobId and filename for each term
        for (Entry<Term, Set<Triple>> entry : schemaTerms.entrySet()) {

            Term term = entry.getKey();

            // add table decoration to table name
            String query = GenericRule.getQuery(entry.getValue(), decoratedTable);

            log.info("\nQuery: " + query);

            String jobId = cloud.startBigDataQuery(query);
            String encodedTerm = FileUtils.encodeFilename(term.getTerm());
            String filename = Utils.TEMP_FOLDER + encodedTerm + Constants.DOT_TERMS;

            // remember the filename and the jobId for this query
            term.setFilename(filename).setJobId(jobId).setEncodedTerm(encodedTerm);

        }

        long start = System.currentTimeMillis();

        String inferredTriplesFile = Utils.TEMP_FOLDER + instanceId + '_' + start + Constants.DOT_INF;

        List<String> productiveTerms = new ArrayList<>();

        int interimInferredTriples = 0;

        try (PrintWriter writer = new PrintWriter(
                new GZIPOutputStream(new FileOutputStream(inferredTriplesFile), Constants.GZIP_BUF_SIZE))) {

            // now loop through the queries
            for (Entry<Term, Set<Triple>> entry : schemaTerms.entrySet()) {

                Term term = entry.getKey();
                log.info("Reasoning for Term: " + term);

                Set<Triple> schemaTriples = entry.getValue();
                log.info("Schema Triples: " + Joiner.on('\n').join(schemaTriples));

                List<String> select = GenericRule.getSelect(schemaTriples);

                // block and wait for each job to complete then save results to a file
                BigInteger rows = BigInteger.ZERO;

                try {
                    rows = cloud.saveBigQueryResultsToFile(term.getJobId(), term.getFilename()).getTotalRows();

                } catch (IOException ioe) {
                    // transient backend errors
                    log.warn("failed to save query results to file, jobId: " + term.getJobId());
                }

                log.info("Query found " + rows + ", rows");

                // only process if triples are found matching this term
                if (!BigInteger.ZERO.equals(rows)) {

                    int inferredTriplesCount = this.inferAndSaveTriplesToFile(term, select, schemaTriples, rows,
                            decoratedTable, writer);

                    productiveTerms.add(term.getTerm());

                    interimInferredTriples += inferredTriplesCount;

                    this.totalRows = this.totalRows.add(rows);

                }
            }
        }

        totalInferredTriples += interimInferredTriples;

        if (interimInferredTriples > 0) {

            //TODO stream smaller numbers of inferred triples
            //TODO try uploading from cloud storage
            int streamingThreshold = Config.getIntegerProperty("ecarf.io.reasoning.streaming.threshold",
                    100000);

            log.info("Inserting " + interimInferredTriples + ", inferred triples into Big Data table for "
                    + productiveTerms.size() + " productive terms. Filename: " + inferredTriplesFile);

            if (interimInferredTriples <= streamingThreshold) {
                // stream the data

                Set<Triple> inferredTriples = TripleUtils.loadCompressedCSVTriples(inferredTriplesFile, false);
                log.info("Total triples to stream into Big Data: " + inferredTriples.size());
                cloud.streamObjectsIntoBigData(inferredTriples, TableUtils.getBigQueryTripleTable(table));

                log.info("All inferred triples are streamed into Big Data table");

            } else {

                // load the data through cloud storage
                // upload the file to cloud storage
                log.info("Uploading inferred triples file into cloud storage: " + inferredTriplesFile);
                StorageObject file = cloud.uploadFileToCloudStorage(inferredTriplesFile, bucket);
                log.info("File " + file + ", uploaded successfully. Now loading it into big data.");

                String jobId = cloud.loadCloudStorageFilesIntoBigData(Lists.newArrayList(file.getUri()),
                        TableUtils.getBigQueryTripleTable(table), false);
                log.info(
                        "All inferred triples are loaded into Big Data table through cloud storage, completed jobId: "
                                + jobId);

            }

            // reset empty retries
            emptyRetries = 0;

        } else {
            log.info("No new inferred triples");
            // increment empty retries
            emptyRetries++;
        }

        log.info("Total inferred triples so far = " + totalInferredTriples + ", current retry count: "
                + emptyRetries);

        ApiUtils.block(Config.getIntegerProperty(Constants.REASON_SLEEP_KEY, 20));

        // FIXME move into the particular cloud implementation service
        long elapsed = System.currentTimeMillis() - start;
        decoratedTable = "[" + table + "@-" + elapsed + "-]";

        log.info("Using table decorator: " + decoratedTable + ". Empty retries count: " + emptyRetries);

    } while (!(emptyRetries == maxRetries)); // end timestamp loop

    log.info("Finished reasoning, total inferred triples = " + totalInferredTriples);
    log.info("Number of avoided duplicate terms = " + this.duplicates);
    log.info("Total rows retrieved from big data = " + this.totalRows);
}

From source file:io.ecarf.core.cloud.task.processor.reason.phase2.DoReasonTask7.java

@Override
public void run() throws IOException {

    GoogleCloudService cloud = (GoogleCloudService) this.getCloudService();

    Stopwatch stopwatch1 = Stopwatch.createUnstarted();
    Stopwatch stopwatch2 = Stopwatch.createUnstarted();
    Set<String> termsSet;

    if (terms == null) {
        // too large, probably saved as a file

        log.info("Using json file for terms: " + termsFile);
        Validate.notNull(termsFile);

        String localTermsFile = Utils.TEMP_FOLDER + termsFile;
        cloud.downloadObjectFromCloudStorage(termsFile, localTermsFile, bucket);

        // convert from JSON
        termsSet = io.cloudex.framework.utils.FileUtils.jsonFileToSet(localTermsFile);

    } else {
        termsSet = ObjectUtils.csvToSet(terms);
    }

    String localSchemaFile = Utils.TEMP_FOLDER + schemaFile;
    // download the file from the cloud storage
    cloud.downloadObjectFromCloudStorage(schemaFile, localSchemaFile, bucket);

    // uncompress if compressed
    if (GzipUtils.isCompressedFilename(schemaFile)) {
        localSchemaFile = GzipUtils.getUncompressedFilename(localSchemaFile);
    }

    Map<Long, Set<Triple>> allSchemaTriples = TripleUtils.getRelevantSchemaETriples(localSchemaFile,
            TermUtils.RDFS_TBOX);

    // get all the triples we care about
    schemaTerms = new HashMap<>();

    for (String termStr : termsSet) {

        Long term = Long.parseLong(termStr);

        if (allSchemaTriples.containsKey(term)) {
            schemaTerms.put(term, allSchemaTriples.get(term));
        }
    }

    String decoratedTable = table;
    int emptyRetries = 0;
    int totalInferredTriples = 0;
    int maxRetries = Config.getIntegerProperty(Constants.REASON_RETRY_KEY, 6);
    String instanceId = cloud.getInstanceId();

    QueryGenerator<Long> generator = new QueryGenerator<Long>(schemaTerms, null);

    // timestamp loop
    do {

        Set<Long> productiveTerms = new HashSet<>();
        int interimInferredTriples = 0;

        // First of all run all the queries asynchronously and remember the jobId and filename for each term

        List<QueryResult> queryResults = new ArrayList<QueryResult>();
        generator.setDecoratedTable(decoratedTable);

        List<String> queries = generator.getQueries();
        log.debug("Generated Queries: " + queries);
        String queryResultFilePrefix = Utils.TEMP_FOLDER + instanceId + '_' + System.currentTimeMillis()
                + "_QueryResults_";
        int fileCount = 0;
        for (String query : queries) {
            String jobId = cloud.startBigDataQuery(query);
            queryResults
                    .add(QueryResult.create().setFilename(queryResultFilePrefix + fileCount).setJobId(jobId));
            fileCount++;
        }

        // invoke all the queries in parallel
        //this.invokeAll(queryTasks);

        long start = System.currentTimeMillis();

        String inferredTriplesFile = Utils.TEMP_FOLDER + instanceId + '_' + start + Constants.DOT_INF;

        // save all the query results in files in parallel
        //this.invokeAll(saveTasks);

        for (QueryResult queryResult : queryResults) {
            try {
                // block and wait for each job to complete then save results to a file
                QueryStats stats = cloud.saveBigQueryResultsToFile(queryResult.getJobId(),
                        queryResult.getFilename());
                queryResult.setStats(stats);

            } catch (IOException ioe) {
                // transient backend errors
                log.warn("failed to save query results to file, jobId: " + queryResult.getJobId(), ioe);
                //TODO should throw an exception
            }
        }

        try (PrintWriter writer = new PrintWriter(
                new GZIPOutputStream(new FileOutputStream(inferredTriplesFile), Constants.GZIP_BUF_SIZE))) {

            // now loop through the queries
            //for(Entry<Term, Set<Triple>> entry: schemaTerms.entrySet()) {
            for (QueryResult queryResult : queryResults) {

                //Term term = entry.getKey();
                QueryStats stats = queryResult.getStats();

                BigInteger rows = stats.getTotalRows();//term.getRows();

                this.totalBytes = this.totalBytes + stats.getTotalProcessedBytes();//term.getBytes();

                // only process if triples are found matching this term
                if (!BigInteger.ZERO.equals(rows)) {

                    stopwatch1.start();

                    int inferredTriplesCount = this.inferAndSaveTriplesToFile(queryResult, productiveTerms,
                            decoratedTable, writer);

                    interimInferredTriples += inferredTriplesCount;

                    this.totalRows = this.totalRows.add(rows);

                    stopwatch1.stop();

                } else {
                    log.info("Skipping query as no data is found");
                }
            }
        }

        totalInferredTriples += interimInferredTriples;

        if (interimInferredTriples > 0) {

            // stream smaller numbers of inferred triples
            // try uploading from cloud storage
            int streamingThreshold = Config.getIntegerProperty("ecarf.io.reasoning.streaming.threshold",
                    100000);

            log.info("Inserting " + interimInferredTriples + ", inferred triples into Big Data table for "
                    + productiveTerms.size() + " productive terms. Filename: " + inferredTriplesFile);

            if (interimInferredTriples <= streamingThreshold) {
                // stream the data

                Set<Triple> inferredTriples = TripleUtils.loadCompressedCSVTriples(inferredTriplesFile, true);
                log.info("Total triples to stream into Big Data: " + inferredTriples.size());
                cloud.streamObjectsIntoBigData(inferredTriples,
                        TableUtils.getBigQueryEncodedTripleTable(table));

                log.info("All inferred triples are streamed into Big Data table");

            } else {

                // load the data through cloud storage
                // upload the file to cloud storage
                log.info("Uploading inferred triples file into cloud storage: " + inferredTriplesFile);
                StorageObject file = cloud.uploadFileToCloudStorage(inferredTriplesFile, bucket);
                log.info("File " + file + ", uploaded successfully. Now loading it into big data.");

                String jobId = cloud.loadCloudStorageFilesIntoBigData(Lists.newArrayList(file.getUri()),
                        TableUtils.getBigQueryEncodedTripleTable(table), false);
                log.info(
                        "All inferred triples are loaded into Big Data table through cloud storage, completed jobId: "
                                + jobId);

            }

            // reset empty retries
            emptyRetries = 0;

            stopwatch2.reset();

        } else {
            log.info("No new inferred triples");
            // increment empty retries
            emptyRetries++;

            if (!stopwatch2.isRunning()) {
                stopwatch2.start();
            }
        }

        log.info("Total inferred triples so far = " + totalInferredTriples + ", current retry count: "
                + emptyRetries);

        if (emptyRetries < maxRetries) {
            ApiUtils.block(Config.getIntegerProperty(Constants.REASON_SLEEP_KEY, 20));

            // FIXME move into the particular cloud implementation service
            long elapsed = System.currentTimeMillis() - start;
            decoratedTable = "[" + table + "@-" + elapsed + "-]";

            log.info("Using table decorator: " + decoratedTable + ". Empty retries count: " + emptyRetries);
        }

    } while (emptyRetries < maxRetries); // end timestamp loop

    //executor.shutdown();
    log.info("Finished reasoning, total inferred triples = " + totalInferredTriples);
    //log.info("Number of avoided duplicate terms = " + this.duplicates);
    log.info("Total rows retrieved from big data = " + this.totalRows);
    log.info("Total processed GBytes = " + ((double) this.totalBytes / FileUtils.ONE_GB));
    log.info("Total process reasoning time (serialization in inf file) = " + stopwatch1);
    log.info("Total time spent in empty inference cycles = " + stopwatch2);
}

From source file:io.cloudex.framework.utils.FileUtils.java

/**
 * (Gzip) Uncompresses a compressed file
 * @param filename
 * @return
 * @throws IOException
 */
public String unCompressFile(String filename) throws IOException {
    FileInputStream fin = new FileInputStream(filename);
    BufferedInputStream in = new BufferedInputStream(fin);
    String outFile = GzipUtils.getUncompressedFilename(filename);
    try (FileOutputStream out = new FileOutputStream(outFile);
            GzipCompressorInputStream gzIn = new GzipCompressorInputStream(in)) {
        final byte[] buffer = new byte[BUFFER];
        int n = 0;
        while (-1 != (n = gzIn.read(buffer))) {
            out.write(buffer, 0, n);
        }
    }
    return outFile;
}

From source file:io.ecarf.core.cloud.task.processor.reason.phase2.DoReasonTask6.java

@Override
public void run() throws IOException {

    GoogleCloudService cloud = (GoogleCloudService) this.getCloudService();

    //String table = metadata.getValue(EcarfMetaData.ECARF_TABLE);
    //Set<String> terms = metadata.getTerms();
    //String schemaFile = metadata.getValue(EcarfMetaData.ECARF_SCHEMA);
    //String bucket = metadata.getBucket();
    Stopwatch stopwatch1 = Stopwatch.createUnstarted();
    Stopwatch stopwatch2 = Stopwatch.createUnstarted();
    Set<String> termsSet;

    if (terms == null) {
        // too large, probably saved as a file
        //String termsFile = metadata.getValue(EcarfMetaData.ECARF_TERMS_FILE);
        log.info("Using json file for terms: " + termsFile);
        Validate.notNull(termsFile);

        String localTermsFile = Utils.TEMP_FOLDER + termsFile;
        cloud.downloadObjectFromCloudStorage(termsFile, localTermsFile, bucket);

        // convert from JSON
        termsSet = io.cloudex.framework.utils.FileUtils.jsonFileToSet(localTermsFile);

    } else {
        termsSet = ObjectUtils.csvToSet(terms);
    }

    String localSchemaFile = Utils.TEMP_FOLDER + schemaFile;
    // download the file from the cloud storage
    cloud.downloadObjectFromCloudStorage(schemaFile, localSchemaFile, bucket);

    // uncompress if compressed
    if (GzipUtils.isCompressedFilename(schemaFile)) {
        localSchemaFile = GzipUtils.getUncompressedFilename(localSchemaFile);
    }

    Map<String, Set<Triple>> allSchemaTriples = TripleUtils.getRelevantSchemaNTriples(localSchemaFile,
            TermUtils.RDFS_TBOX);

    // get all the triples we care about
    schemaTerms = new HashMap<>();

    for (String term : termsSet) {
        if (allSchemaTriples.containsKey(term)) {
            schemaTerms.put(term, allSchemaTriples.get(term));
        }
    }

    String decoratedTable = table;
    int emptyRetries = 0;
    int totalInferredTriples = 0;
    int maxRetries = Config.getIntegerProperty(Constants.REASON_RETRY_KEY, 6);
    String instanceId = cloud.getInstanceId();

    QueryGenerator<String> generator = new QueryGenerator<String>(schemaTerms, null);

    // timestamp loop
    do {

        Set<String> productiveTerms = new HashSet<>();
        int interimInferredTriples = 0;

        // First of all run all the queries asynchronously and remember the jobId and filename for each term

        List<QueryResult> queryResults = new ArrayList<QueryResult>();
        generator.setDecoratedTable(decoratedTable);

        List<String> queries = generator.getQueries();
        log.debug("Generated Queries: " + queries);
        String queryResultFilePrefix = Utils.TEMP_FOLDER + instanceId + '_' + System.currentTimeMillis()
                + "_QueryResults_";
        int fileCount = 0;
        for (String query : queries) {
            String jobId = cloud.startBigDataQuery(query);
            queryResults
                    .add(QueryResult.create().setFilename(queryResultFilePrefix + fileCount).setJobId(jobId));
            fileCount++;
        }

        // invoke all the queries in parallel
        //this.invokeAll(queryTasks);

        long start = System.currentTimeMillis();

        String inferredTriplesFile = Utils.TEMP_FOLDER + instanceId + '_' + start + Constants.DOT_INF;

        // save all the query results in files in parallel
        //this.invokeAll(saveTasks);

        for (QueryResult queryResult : queryResults) {
            try {
                // block and wait for each job to complete then save results to a file
                QueryStats stats = cloud.saveBigQueryResultsToFile(queryResult.getJobId(),
                        queryResult.getFilename());
                queryResult.setStats(stats);

            } catch (IOException ioe) {
                // transient backend errors
                log.warn("failed to save query results to file, jobId: " + queryResult.getJobId(), ioe);
                //TODO should throw an exception
            }
        }

        try (PrintWriter writer = new PrintWriter(
                new GZIPOutputStream(new FileOutputStream(inferredTriplesFile), Constants.GZIP_BUF_SIZE))) {

            // now loop through the queries
            //for(Entry<Term, Set<Triple>> entry: schemaTerms.entrySet()) {
            for (QueryResult queryResult : queryResults) {

                //Term term = entry.getKey();
                QueryStats stats = queryResult.getStats();

                BigInteger rows = stats.getTotalRows();//term.getRows();

                this.totalBytes = this.totalBytes + stats.getTotalProcessedBytes();//term.getBytes();

                // only process if triples are found matching this term
                if (!BigInteger.ZERO.equals(rows)) {

                    stopwatch1.start();

                    int inferredTriplesCount = this.inferAndSaveTriplesToFile(queryResult, productiveTerms,
                            decoratedTable, writer);

                    interimInferredTriples += inferredTriplesCount;

                    this.totalRows = this.totalRows.add(rows);

                    stopwatch1.stop();

                } else {
                    log.info("Skipping query as no data is found");
                }
            }
        }

        totalInferredTriples += interimInferredTriples;

        if (interimInferredTriples > 0) {

            // stream smaller numbers of inferred triples
            // try uploading from cloud storage
            int streamingThreshold = Config.getIntegerProperty("ecarf.io.reasoning.streaming.threshold",
                    100000);

            log.info("Inserting " + interimInferredTriples + ", inferred triples into Big Data table for "
                    + productiveTerms.size() + " productive terms. Filename: " + inferredTriplesFile);

            if (interimInferredTriples <= streamingThreshold) {
                // stream the data

                Set<Triple> inferredTriples = TripleUtils.loadCompressedCSVTriples(inferredTriplesFile, false);
                log.info("Total triples to stream into Big Data: " + inferredTriples.size());
                cloud.streamObjectsIntoBigData(inferredTriples, TableUtils.getBigQueryTripleTable(table));

                log.info("All inferred triples are streamed into Big Data table");

            } else {

                // load the data through cloud storage
                // upload the file to cloud storage
                log.info("Uploading inferred triples file into cloud storage: " + inferredTriplesFile);
                StorageObject file = cloud.uploadFileToCloudStorage(inferredTriplesFile, bucket);
                log.info("File " + file + ", uploaded successfully. Now loading it into big data.");

                String jobId = cloud.loadCloudStorageFilesIntoBigData(Lists.newArrayList(file.getUri()),
                        TableUtils.getBigQueryTripleTable(table), false);
                log.info(
                        "All inferred triples are loaded into Big Data table through cloud storage, completed jobId: "
                                + jobId);

            }

            // reset empty retries
            emptyRetries = 0;

            stopwatch2.reset();

        } else {
            log.info("No new inferred triples");
            // increment empty retries
            emptyRetries++;

            if (!stopwatch2.isRunning()) {
                stopwatch2.start();
            }
        }

        log.info("Total inferred triples so far = " + totalInferredTriples + ", current retry count: "
                + emptyRetries);

        if (emptyRetries < maxRetries) {
            ApiUtils.block(Config.getIntegerProperty(Constants.REASON_SLEEP_KEY, 20));

            // FIXME move into the particular cloud implementation service
            long elapsed = System.currentTimeMillis() - start;
            decoratedTable = "[" + table + "@-" + elapsed + "-]";

            log.info("Using table decorator: " + decoratedTable + ". Empty retries count: " + emptyRetries);
        }

    } while (emptyRetries < maxRetries); // end timestamp loop

    //executor.shutdown();
    log.info("Finished reasoning, total inferred triples = " + totalInferredTriples);
    log.info("Number of avoided duplicate terms = " + this.duplicates);
    log.info("Total rows retrieved from big data = " + this.totalRows);
    log.info("Total processed GBytes = " + ((double) this.totalBytes / FileUtils.ONE_GB));
    log.info("Total process reasoning time (serialization in inf file) = " + stopwatch1);
    log.info("Total time spent in empty inference cycles = " + stopwatch2);
}