Example usage for org.apache.commons.io FileUtils ONE_GB

Introduction

This page collects example usages of the org.apache.commons.io FileUtils ONE_GB constant.

Prototype

public static final long ONE_GB

Document

The number of bytes in a gigabyte.
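
A minimal sketch of typical use: converting a raw byte count into gigabytes for display. The byte count below is an arbitrary illustration value, and ONE_GB is a long, so the cast to double keeps the fractional part.

import org.apache.commons.io.FileUtils;

public class OneGbExample {

    public static void main(String[] args) {
        long totalBytes = 5_368_709_120L; // arbitrary example value (5 * 1024^3 bytes)

        // ONE_GB is a long; cast to double so the division keeps the fraction
        double gigabytes = (double) totalBytes / FileUtils.ONE_GB;
        System.out.println("Total processed GBytes = " + gigabytes);

        // Sibling constants from the same class
        System.out.println(totalBytes / FileUtils.ONE_MB + " MB");
        System.out.println(totalBytes / FileUtils.ONE_KB + " KB");
    }
}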

Usage

From source file: io.ecarf.core.cloud.task.processor.reason.phase2.DoReasonTask9.java

@Override
public void run() throws IOException {

    GoogleCloudService cloud = (GoogleCloudService) this.getCloudService();

    Stopwatch stopwatch1 = Stopwatch.createUnstarted();
    Stopwatch stopwatch2 = Stopwatch.createUnstarted();

    this.setup(cloud);

    String decoratedTable = table;
    int emptyRetries = 0;
    int totalInferredTriples = 0;

    int maxRetries;
    if (this.retries == null) {
        maxRetries = Config.getIntegerProperty(Constants.REASON_RETRY_KEY, 6);

    } else {
        maxRetries = this.retries;
    }

    int cycleSleep;
    if (this.sleep == null) {
        cycleSleep = Config.getIntegerProperty(Constants.REASON_SLEEP_KEY, 20);
    } else {

        cycleSleep = this.sleep;
    }

    this.ddLimit = Config.getIntegerProperty(Constants.REASON_DATA_DIRECT_DOWNLOAD_LIMIT, 1_200_000);
    int streamingThreshold = Config.getIntegerProperty("ecarf.io.reasoning.streaming.threshold", 100000);
    String instanceId = cloud.getInstanceId();

    int processors = Runtime.getRuntime().availableProcessors();

    if (processors > 1) {
        this.executor = Utils.createFixedThreadPool(processors);
    }

    int count = 0;

    QueryGenerator<Long> generator = new QueryGenerator<Long>(schemaTerms, null);

    // timestamp loop
    do {

        // First of all run all the queries asynchronously and remember the jobId and filename for each term
        generator.setDecoratedTable(decoratedTable);

        String query = generator.getQuery();
        log.debug("Generated Query: " + query);

        String queryResultFilePrefix = instanceId + "_QueryResults_" + count;

        String jobId = cloud.startBigDataQuery(query, new BigDataTable(this.table));
        //QueryResult   queryResult = QueryResult.create().setFilename(queryResultFilePrefix).setJobId(jobId);

        long start = System.currentTimeMillis();

        // block and wait for each job to complete then save results to a file
        QueryStats stats = cloud.saveBigQueryResultsToFile(jobId, queryResultFilePrefix, this.bucket,
                processors, this.ddLimit);

        BigInteger rows = stats.getTotalRows();

        this.totalBytes = this.totalBytes + stats.getTotalProcessedBytes();

        Set<Long> productiveTerms = new HashSet<>();
        Set<String> inferredTriplesFiles = new HashSet<>();
        int interimInferredTriples = 0;

        // only process if triples are found matching this term
        if ((rows != null) && !BigInteger.ZERO.equals(rows)) {

            stopwatch1.start();

            interimInferredTriples = this.inferAndSaveTriplesToFile(stats, productiveTerms, processors,
                    inferredTriplesFiles);

            this.totalRows = this.totalRows.add(rows);

            stopwatch1.stop();

        } else {
            log.info("Skipping query as no data is found");
        }

        totalInferredTriples += interimInferredTriples;

        if (interimInferredTriples > 0) {

            // stream smaller numbers of inferred triples
            // try uploading from cloud storage

            log.info("Inserting " + interimInferredTriples + ", inferred triples into Big Data table for "
                    + productiveTerms.size() + " productive terms. Filename: " + inferredTriplesFiles);

            if (interimInferredTriples <= streamingThreshold) {
                // stream the data

                Set<Triple> inferredTriples = new HashSet<>();
                for (String inferredTriplesFile : inferredTriplesFiles) {
                    TripleUtils.loadCompressedCSVTriples(inferredTriplesFile, true, inferredTriples);
                }

                log.info("Total triples to stream into Big Data: " + inferredTriples.size());
                cloud.streamObjectsIntoBigData(inferredTriples,
                        TableUtils.getBigQueryEncodedTripleTable(table));

                log.info("All inferred triples are streamed into Big Data table");

            } else {

                List<String> cloudStorageFiles = new ArrayList<>();
                // load the data through cloud storage
                // upload the file to cloud storage
                for (String inferredTriplesFile : inferredTriplesFiles) {
                    log.info("Uploading inferred triples file into cloud storage: " + inferredTriplesFile);
                    StorageObject file = cloud.uploadFileToCloudStorage(inferredTriplesFile, bucket);
                    log.info("File " + file + ", uploaded successfully. Now loading it into big data.");
                    cloudStorageFiles.add(file.getUri());
                }

                jobId = cloud.loadCloudStorageFilesIntoBigData(cloudStorageFiles,
                        TableUtils.getBigQueryEncodedTripleTable(table), false);

                log.info(
                        "All inferred triples are loaded into Big Data table through cloud storage, completed jobId: "
                                + jobId);

            }

            // reset empty retries
            emptyRetries = 0;

            stopwatch2.reset();

        } else {
            log.info("No new inferred triples");
            // increment empty retries
            emptyRetries++;

            if (!stopwatch2.isRunning()) {
                stopwatch2.start();
            }
        }

        log.info("Total inferred triples so far = " + totalInferredTriples + ", current retry count: "
                + emptyRetries);

        if (emptyRetries < maxRetries) {
            ApiUtils.block(cycleSleep);

            // FIXME move into the particular cloud implementation service
            long elapsed = System.currentTimeMillis() - start;
            decoratedTable = "[" + table + "@-" + elapsed + "-]";

            log.info("Using table decorator: " + decoratedTable + ". Empty retries count: " + emptyRetries);
        }

        count++;

    } while (emptyRetries < maxRetries); // end timestamp loop

    executor.shutdown();
    log.info("Finished reasoning, total inferred triples = " + totalInferredTriples);
    //log.info("Number of avoided duplicate terms = " + this.duplicates);
    log.info("Total rows retrieved from big data = " + this.totalRows);
    log.info("Total processed GBytes = " + ((double) this.totalBytes / FileUtils.ONE_GB));
    log.info("Total process reasoning time (serialization in inf file) = " + stopwatch1);
    log.info("Total time spent in empty inference cycles = " + stopwatch2);
}

From source file: io.ecarf.core.cloud.task.processor.reason.phase1.DoReasonTask5.java

@Override
public void run() throws IOException {

    GoogleCloudService cloud = (GoogleCloudService) this.getCloudService();

    //String table = metadata.getValue(EcarfMetaData.ECARF_TABLE);
    //Set<String> terms = metadata.getTerms();
    //String schemaFile = metadata.getValue(EcarfMetaData.ECARF_SCHEMA);
    //String bucket = metadata.getBucket();
    Stopwatch stopwatch1 = Stopwatch.createUnstarted();
    Stopwatch stopwatch2 = Stopwatch.createUnstarted();
    Set<String> termsSet;

    if (terms == null) {
        // too large, probably saved as a file
        //String termsFile = metadata.getValue(EcarfMetaData.ECARF_TERMS_FILE);
        log.info("Using json file for terms: " + termsFile);
        Validate.notNull(termsFile);

        String localTermsFile = Utils.TEMP_FOLDER + termsFile;
        cloud.downloadObjectFromCloudStorage(termsFile, localTermsFile, bucket);

        // convert from JSON
        termsSet = io.cloudex.framework.utils.FileUtils.jsonFileToSet(localTermsFile);

    } else {
        termsSet = ObjectUtils.csvToSet(terms);
    }

    String localSchemaFile = Utils.TEMP_FOLDER + schemaFile;
    // download the file from the cloud storage
    cloud.downloadObjectFromCloudStorage(schemaFile, localSchemaFile, bucket);

    // uncompress if compressed
    if (GzipUtils.isCompressedFilename(schemaFile)) {
        localSchemaFile = GzipUtils.getUncompressedFilename(localSchemaFile);
    }

    Map<String, Set<Triple>> allSchemaTriples = TripleUtils.getRelevantSchemaNTriples(localSchemaFile,
            TermUtils.RDFS_TBOX);

    // get all the triples we care about
    Map<Term, Set<Triple>> schemaTerms = new HashMap<>();

    for (String term : termsSet) {
        if (allSchemaTriples.containsKey(term)) {
            schemaTerms.put(new Term(term), allSchemaTriples.get(term));
        }
    }

    String decoratedTable = table;
    int emptyRetries = 0;
    int totalInferredTriples = 0;
    int maxRetries = Config.getIntegerProperty(Constants.REASON_RETRY_KEY, 6);
    String instanceId = cloud.getInstanceId();

    // timestamp loop
    do {

        List<String> productiveTerms = new ArrayList<>();
        int interimInferredTriples = 0;

        // First of all run all the queries asynchronously and remember the jobId and filename for each term
        List<Callable<Void>> queryTasks = new ArrayList<>();
        List<Callable<Void>> saveTasks = new ArrayList<>();

        for (Entry<Term, Set<Triple>> entry : schemaTerms.entrySet()) {

            Term term = entry.getKey();
            Set<Triple> triples = entry.getValue();

            QuerySubTask queryTask = new QuerySubTask(term, triples, decoratedTable, cloud);
            queryTasks.add(queryTask);

            SaveResultsSubTask saveTask = new SaveResultsSubTask(term, cloud);
            saveTasks.add(saveTask);
        }

        // invoke all the queries in parallel
        this.invokeAll(queryTasks);

        long start = System.currentTimeMillis();

        String inferredTriplesFile = Utils.TEMP_FOLDER + instanceId + '_' + start + Constants.DOT_INF;

        // save all the query results in files in parallel
        this.invokeAll(saveTasks);

        try (PrintWriter writer = new PrintWriter(
                new GZIPOutputStream(new FileOutputStream(inferredTriplesFile), Constants.GZIP_BUF_SIZE))) {

            // now loop through the queries
            for (Entry<Term, Set<Triple>> entry : schemaTerms.entrySet()) {

                Term term = entry.getKey();

                BigInteger rows = term.getRows();

                this.totalBytes = this.totalBytes + term.getBytes();

                // only process if triples are found matching this term
                if (!BigInteger.ZERO.equals(rows)) {

                    stopwatch1.start();

                    log.info("Reasoning for Term: " + term);

                    Set<Triple> schemaTriples = entry.getValue();
                    log.info("Schema Triples: " + Joiner.on('\n').join(schemaTriples));

                    List<String> select = GenericRule.getSelect(schemaTriples);

                    int inferredTriplesCount = this.inferAndSaveTriplesToFile(term, select, schemaTriples, rows,
                            decoratedTable, writer);

                    productiveTerms.add(term.getTerm());

                    interimInferredTriples += inferredTriplesCount;

                    this.totalRows = this.totalRows.add(rows);

                    stopwatch1.stop();

                } else {
                    log.info("Skipping term as no data found: " + term);
                }
            }
        }

        totalInferredTriples += interimInferredTriples;

        if (interimInferredTriples > 0) {

            // stream smaller numbers of inferred triples
            // try uploading from cloud storage
            int streamingThreshold = Config.getIntegerProperty("ecarf.io.reasoning.streaming.threshold",
                    100000);

            log.info("Inserting " + interimInferredTriples + ", inferred triples into Big Data table for "
                    + productiveTerms.size() + " productive terms. Filename: " + inferredTriplesFile);

            if (interimInferredTriples <= streamingThreshold) {
                // stream the data

                Set<Triple> inferredTriples = TripleUtils.loadCompressedCSVTriples(inferredTriplesFile, false);
                log.info("Total triples to stream into Big Data: " + inferredTriples.size());
                cloud.streamObjectsIntoBigData(inferredTriples, TableUtils.getBigQueryTripleTable(table));

                log.info("All inferred triples are streamed into Big Data table");

            } else {

                // load the data through cloud storage
                // upload the file to cloud storage
                log.info("Uploading inferred triples file into cloud storage: " + inferredTriplesFile);
                StorageObject file = cloud.uploadFileToCloudStorage(inferredTriplesFile, bucket);
                log.info("File " + file + ", uploaded successfully. Now loading it into big data.");

                String jobId = cloud.loadCloudStorageFilesIntoBigData(Lists.newArrayList(file.getUri()),
                        TableUtils.getBigQueryTripleTable(table), false);
                log.info(
                        "All inferred triples are loaded into Big Data table through cloud storage, completed jobId: "
                                + jobId);

            }

            // reset empty retries
            emptyRetries = 0;

            stopwatch2.reset();

        } else {
            log.info("No new inferred triples");
            // increment empty retries
            emptyRetries++;

            if (!stopwatch2.isRunning()) {
                stopwatch2.start();
            }
        }

        log.info("Total inferred triples so far = " + totalInferredTriples + ", current retry count: "
                + emptyRetries);

        if (emptyRetries < maxRetries) {
            ApiUtils.block(Config.getIntegerProperty(Constants.REASON_SLEEP_KEY, 20));

            // FIXME move into the particular cloud implementation service
            long elapsed = System.currentTimeMillis() - start;
            decoratedTable = "[" + table + "@-" + elapsed + "-]";

            log.info("Using table decorator: " + decoratedTable + ". Empty retries count: " + emptyRetries);
        }

    } while (emptyRetries < maxRetries); // end timestamp loop

    executor.shutdown();
    log.info("Finished reasoning, total inferred triples = " + totalInferredTriples);
    log.info("Number of avoided duplicate terms = " + this.duplicates);
    log.info("Total rows retrieved from big data = " + this.totalRows);
    log.info("Total processed GBytes = " + ((double) this.totalBytes / FileUtils.ONE_GB));
    log.info("Total process reasoning time (serialization in inf file) = " + stopwatch1);
    log.info("Total time spent in empty inference cycles = " + stopwatch2);
}

From source file: io.ecarf.core.cloud.task.processor.reason.phase2.DoReasonTask7.java

@Override
public void run() throws IOException {

    GoogleCloudService cloud = (GoogleCloudService) this.getCloudService();

    Stopwatch stopwatch1 = Stopwatch.createUnstarted();
    Stopwatch stopwatch2 = Stopwatch.createUnstarted();
    Set<String> termsSet;

    if (terms == null) {
        // too large, probably saved as a file

        log.info("Using json file for terms: " + termsFile);
        Validate.notNull(termsFile);

        String localTermsFile = Utils.TEMP_FOLDER + termsFile;
        cloud.downloadObjectFromCloudStorage(termsFile, localTermsFile, bucket);

        // convert from JSON
        termsSet = io.cloudex.framework.utils.FileUtils.jsonFileToSet(localTermsFile);

    } else {
        termsSet = ObjectUtils.csvToSet(terms);
    }

    String localSchemaFile = Utils.TEMP_FOLDER + schemaFile;
    // download the file from the cloud storage
    cloud.downloadObjectFromCloudStorage(schemaFile, localSchemaFile, bucket);

    // uncompress if compressed
    if (GzipUtils.isCompressedFilename(schemaFile)) {
        localSchemaFile = GzipUtils.getUncompressedFilename(localSchemaFile);
    }

    Map<Long, Set<Triple>> allSchemaTriples = TripleUtils.getRelevantSchemaETriples(localSchemaFile,
            TermUtils.RDFS_TBOX);

    // get all the triples we care about
    schemaTerms = new HashMap<>();

    for (String termStr : termsSet) {

        Long term = Long.parseLong(termStr);

        if (allSchemaTriples.containsKey(term)) {
            schemaTerms.put(term, allSchemaTriples.get(term));
        }
    }

    String decoratedTable = table;
    int emptyRetries = 0;
    int totalInferredTriples = 0;
    int maxRetries = Config.getIntegerProperty(Constants.REASON_RETRY_KEY, 6);
    String instanceId = cloud.getInstanceId();

    QueryGenerator<Long> generator = new QueryGenerator<Long>(schemaTerms, null);

    // timestamp loop
    do {

        Set<Long> productiveTerms = new HashSet<>();
        int interimInferredTriples = 0;

        // First of all run all the queries asynchronously and remember the jobId and filename for each term

        List<QueryResult> queryResults = new ArrayList<QueryResult>();
        generator.setDecoratedTable(decoratedTable);

        List<String> queries = generator.getQueries();
        log.debug("Generated Queries: " + queries);
        String queryResultFilePrefix = Utils.TEMP_FOLDER + instanceId + '_' + System.currentTimeMillis()
                + "_QueryResults_";
        int fileCount = 0;
        for (String query : queries) {
            String jobId = cloud.startBigDataQuery(query);
            queryResults
                    .add(QueryResult.create().setFilename(queryResultFilePrefix + fileCount).setJobId(jobId));
            fileCount++;
        }

        // invoke all the queries in parallel
        //this.invokeAll(queryTasks);

        long start = System.currentTimeMillis();

        String inferredTriplesFile = Utils.TEMP_FOLDER + instanceId + '_' + start + Constants.DOT_INF;

        // save all the query results in files in parallel
        //this.invokeAll(saveTasks);

        for (QueryResult queryResult : queryResults) {
            try {
                // block and wait for each job to complete then save results to a file
                QueryStats stats = cloud.saveBigQueryResultsToFile(queryResult.getJobId(),
                        queryResult.getFilename());
                queryResult.setStats(stats);

            } catch (IOException ioe) {
                // transient backend errors
                log.warn("failed to save query results to file, jobId: " + queryResult.getJobId(), ioe);
                //TODO should throw an exception
            }
        }

        try (PrintWriter writer = new PrintWriter(
                new GZIPOutputStream(new FileOutputStream(inferredTriplesFile), Constants.GZIP_BUF_SIZE))) {

            // now loop through the queries
            //for(Entry<Term, Set<Triple>> entry: schemaTerms.entrySet()) {
            for (QueryResult queryResult : queryResults) {

                //Term term = entry.getKey();
                QueryStats stats = queryResult.getStats();

                BigInteger rows = stats.getTotalRows();//term.getRows();

                this.totalBytes = this.totalBytes + stats.getTotalProcessedBytes();//term.getBytes();

                // only process if triples are found matching this term
                if (!BigInteger.ZERO.equals(rows)) {

                    stopwatch1.start();

                    int inferredTriplesCount = this.inferAndSaveTriplesToFile(queryResult, productiveTerms,
                            decoratedTable, writer);

                    interimInferredTriples += inferredTriplesCount;

                    this.totalRows = this.totalRows.add(rows);

                    stopwatch1.stop();

                } else {
                    log.info("Skipping query as no data is found");
                }
            }
        }

        totalInferredTriples += interimInferredTriples;

        if (interimInferredTriples > 0) {

            // stream smaller numbers of inferred triples
            // try uploading from cloud storage
            int streamingThreshold = Config.getIntegerProperty("ecarf.io.reasoning.streaming.threshold",
                    100000);

            log.info("Inserting " + interimInferredTriples + ", inferred triples into Big Data table for "
                    + productiveTerms.size() + " productive terms. Filename: " + inferredTriplesFile);

            if (interimInferredTriples <= streamingThreshold) {
                // stream the data

                Set<Triple> inferredTriples = TripleUtils.loadCompressedCSVTriples(inferredTriplesFile, true);
                log.info("Total triples to stream into Big Data: " + inferredTriples.size());
                cloud.streamObjectsIntoBigData(inferredTriples,
                        TableUtils.getBigQueryEncodedTripleTable(table));

                log.info("All inferred triples are streamed into Big Data table");

            } else {

                // load the data through cloud storage
                // upload the file to cloud storage
                log.info("Uploading inferred triples file into cloud storage: " + inferredTriplesFile);
                StorageObject file = cloud.uploadFileToCloudStorage(inferredTriplesFile, bucket);
                log.info("File " + file + ", uploaded successfully. Now loading it into big data.");

                String jobId = cloud.loadCloudStorageFilesIntoBigData(Lists.newArrayList(file.getUri()),
                        TableUtils.getBigQueryEncodedTripleTable(table), false);
                log.info(
                        "All inferred triples are loaded into Big Data table through cloud storage, completed jobId: "
                                + jobId);

            }

            // reset empty retries
            emptyRetries = 0;

            stopwatch2.reset();

        } else {
            log.info("No new inferred triples");
            // increment empty retries
            emptyRetries++;

            if (!stopwatch2.isRunning()) {
                stopwatch2.start();
            }
        }

        log.info("Total inferred triples so far = " + totalInferredTriples + ", current retry count: "
                + emptyRetries);

        if (emptyRetries < maxRetries) {
            ApiUtils.block(Config.getIntegerProperty(Constants.REASON_SLEEP_KEY, 20));

            // FIXME move into the particular cloud implementation service
            long elapsed = System.currentTimeMillis() - start;
            decoratedTable = "[" + table + "@-" + elapsed + "-]";

            log.info("Using table decorator: " + decoratedTable + ". Empty retries count: " + emptyRetries);
        }

    } while (emptyRetries < maxRetries); // end timestamp loop

    //executor.shutdown();
    log.info("Finished reasoning, total inferred triples = " + totalInferredTriples);
    //log.info("Number of avoided duplicate terms = " + this.duplicates);
    log.info("Total rows retrieved from big data = " + this.totalRows);
    log.info("Total processed GBytes = " + ((double) this.totalBytes / FileUtils.ONE_GB));
    log.info("Total process reasoning time (serialization in inf file) = " + stopwatch1);
    log.info("Total time spent in empty inference cycles = " + stopwatch2);
}

From source file: io.ecarf.core.cloud.task.processor.reason.phase2.DoReasonTask6.java

@Override
public void run() throws IOException {

    GoogleCloudService cloud = (GoogleCloudService) this.getCloudService();

    //String table = metadata.getValue(EcarfMetaData.ECARF_TABLE);
    //Set<String> terms = metadata.getTerms();
    //String schemaFile = metadata.getValue(EcarfMetaData.ECARF_SCHEMA);
    //String bucket = metadata.getBucket();
    Stopwatch stopwatch1 = Stopwatch.createUnstarted();
    Stopwatch stopwatch2 = Stopwatch.createUnstarted();
    Set<String> termsSet;

    if (terms == null) {
        // too large, probably saved as a file
        //String termsFile = metadata.getValue(EcarfMetaData.ECARF_TERMS_FILE);
        log.info("Using json file for terms: " + termsFile);
        Validate.notNull(termsFile);

        String localTermsFile = Utils.TEMP_FOLDER + termsFile;
        cloud.downloadObjectFromCloudStorage(termsFile, localTermsFile, bucket);

        // convert from JSON
        termsSet = io.cloudex.framework.utils.FileUtils.jsonFileToSet(localTermsFile);

    } else {
        termsSet = ObjectUtils.csvToSet(terms);
    }

    String localSchemaFile = Utils.TEMP_FOLDER + schemaFile;
    // download the file from the cloud storage
    cloud.downloadObjectFromCloudStorage(schemaFile, localSchemaFile, bucket);

    // uncompress if compressed
    if (GzipUtils.isCompressedFilename(schemaFile)) {
        localSchemaFile = GzipUtils.getUncompressedFilename(localSchemaFile);
    }

    Map<String, Set<Triple>> allSchemaTriples = TripleUtils.getRelevantSchemaNTriples(localSchemaFile,
            TermUtils.RDFS_TBOX);

    // get all the triples we care about
    schemaTerms = new HashMap<>();

    for (String term : termsSet) {
        if (allSchemaTriples.containsKey(term)) {
            schemaTerms.put(term, allSchemaTriples.get(term));
        }
    }

    String decoratedTable = table;
    int emptyRetries = 0;
    int totalInferredTriples = 0;
    int maxRetries = Config.getIntegerProperty(Constants.REASON_RETRY_KEY, 6);
    String instanceId = cloud.getInstanceId();

    QueryGenerator<String> generator = new QueryGenerator<String>(schemaTerms, null);

    // timestamp loop
    do {

        Set<String> productiveTerms = new HashSet<>();
        int interimInferredTriples = 0;

        // First of all run all the queries asynchronously and remember the jobId and filename for each term

        List<QueryResult> queryResults = new ArrayList<QueryResult>();
        generator.setDecoratedTable(decoratedTable);

        List<String> queries = generator.getQueries();
        log.debug("Generated Queries: " + queries);
        String queryResultFilePrefix = Utils.TEMP_FOLDER + instanceId + '_' + System.currentTimeMillis()
                + "_QueryResults_";
        int fileCount = 0;
        for (String query : queries) {
            String jobId = cloud.startBigDataQuery(query);
            queryResults
                    .add(QueryResult.create().setFilename(queryResultFilePrefix + fileCount).setJobId(jobId));
            fileCount++;
        }

        // invoke all the queries in parallel
        //this.invokeAll(queryTasks);

        long start = System.currentTimeMillis();

        String inferredTriplesFile = Utils.TEMP_FOLDER + instanceId + '_' + start + Constants.DOT_INF;

        // save all the query results in files in parallel
        //this.invokeAll(saveTasks);

        for (QueryResult queryResult : queryResults) {
            try {
                // block and wait for each job to complete then save results to a file
                QueryStats stats = cloud.saveBigQueryResultsToFile(queryResult.getJobId(),
                        queryResult.getFilename());
                queryResult.setStats(stats);

            } catch (IOException ioe) {
                // transient backend errors
                log.warn("failed to save query results to file, jobId: " + queryResult.getJobId(), ioe);
                //TODO should throw an exception
            }
        }

        try (PrintWriter writer = new PrintWriter(
                new GZIPOutputStream(new FileOutputStream(inferredTriplesFile), Constants.GZIP_BUF_SIZE))) {

            // now loop through the queries
            //for(Entry<Term, Set<Triple>> entry: schemaTerms.entrySet()) {
            for (QueryResult queryResult : queryResults) {

                //Term term = entry.getKey();
                QueryStats stats = queryResult.getStats();

                BigInteger rows = stats.getTotalRows();//term.getRows();

                this.totalBytes = this.totalBytes + stats.getTotalProcessedBytes();//term.getBytes();

                // only process if triples are found matching this term
                if (!BigInteger.ZERO.equals(rows)) {

                    stopwatch1.start();

                    int inferredTriplesCount = this.inferAndSaveTriplesToFile(queryResult, productiveTerms,
                            decoratedTable, writer);

                    interimInferredTriples += inferredTriplesCount;

                    this.totalRows = this.totalRows.add(rows);

                    stopwatch1.stop();

                } else {
                    log.info("Skipping query as no data is found");
                }
            }
        }

        totalInferredTriples += interimInferredTriples;

        if (interimInferredTriples > 0) {

            // stream smaller numbers of inferred triples
            // try uploading from cloud storage
            int streamingThreshold = Config.getIntegerProperty("ecarf.io.reasoning.streaming.threshold",
                    100000);

            log.info("Inserting " + interimInferredTriples + ", inferred triples into Big Data table for "
                    + productiveTerms.size() + " productive terms. Filename: " + inferredTriplesFile);

            if (interimInferredTriples <= streamingThreshold) {
                // stream the data

                Set<Triple> inferredTriples = TripleUtils.loadCompressedCSVTriples(inferredTriplesFile, false);
                log.info("Total triples to stream into Big Data: " + inferredTriples.size());
                cloud.streamObjectsIntoBigData(inferredTriples, TableUtils.getBigQueryTripleTable(table));

                log.info("All inferred triples are streamed into Big Data table");

            } else {

                // load the data through cloud storage
                // upload the file to cloud storage
                log.info("Uploading inferred triples file into cloud storage: " + inferredTriplesFile);
                StorageObject file = cloud.uploadFileToCloudStorage(inferredTriplesFile, bucket);
                log.info("File " + file + ", uploaded successfully. Now loading it into big data.");

                String jobId = cloud.loadCloudStorageFilesIntoBigData(Lists.newArrayList(file.getUri()),
                        TableUtils.getBigQueryTripleTable(table), false);
                log.info(
                        "All inferred triples are loaded into Big Data table through cloud storage, completed jobId: "
                                + jobId);

            }

            // reset empty retries
            emptyRetries = 0;

            stopwatch2.reset();

        } else {
            log.info("No new inferred triples");
            // increment empty retries
            emptyRetries++;

            if (!stopwatch2.isRunning()) {
                stopwatch2.start();
            }
        }

        log.info("Total inferred triples so far = " + totalInferredTriples + ", current retry count: "
                + emptyRetries);

        if (emptyRetries < maxRetries) {
            ApiUtils.block(Config.getIntegerProperty(Constants.REASON_SLEEP_KEY, 20));

            // FIXME move into the particular cloud implementation service
            long elapsed = System.currentTimeMillis() - start;
            decoratedTable = "[" + table + "@-" + elapsed + "-]";

            log.info("Using table decorator: " + decoratedTable + ". Empty retries count: " + emptyRetries);
        }

    } while (emptyRetries < maxRetries); // end timestamp loop

    //executor.shutdown();
    log.info("Finished reasoning, total inferred triples = " + totalInferredTriples);
    log.info("Number of avoided duplicate terms = " + this.duplicates);
    log.info("Total rows retrieved from big data = " + this.totalRows);
    log.info("Total processed GBytes = " + ((double) this.totalBytes / FileUtils.ONE_GB));
    log.info("Total process reasoning time (serialization in inf file) = " + stopwatch1);
    log.info("Total time spent in empty inference cycles = " + stopwatch2);
}

From source file: io.ecarf.core.cloud.task.processor.reason.phase2.DoReasonTask8.java

@Override
public void run() throws IOException {

    GoogleCloudService cloud = (GoogleCloudService) this.getCloudService();

    Stopwatch stopwatch1 = Stopwatch.createUnstarted();
    Stopwatch stopwatch2 = Stopwatch.createUnstarted();
    Set<String> termsSet;

    if (terms == null) {
        // too large, probably saved as a file

        log.info("Using json file for terms: " + termsFile);
        Validate.notNull(termsFile);

        String localTermsFile = Utils.TEMP_FOLDER + termsFile;
        cloud.downloadObjectFromCloudStorage(termsFile, localTermsFile, bucket);

        // convert from JSON
        termsSet = io.cloudex.framework.utils.FileUtils.jsonFileToSet(localTermsFile);

    } else {
        termsSet = ObjectUtils.csvToSet(terms);
    }

    String localSchemaFile = Utils.TEMP_FOLDER + schemaFile;
    // download the file from the cloud storage
    cloud.downloadObjectFromCloudStorage(schemaFile, localSchemaFile, bucket);

    // uncompress if compressed
    if (GzipUtils.isCompressedFilename(schemaFile)) {
        localSchemaFile = GzipUtils.getUncompressedFilename(localSchemaFile);
    }

    Map<Long, Set<Triple>> allSchemaTriples = TripleUtils.getRelevantSchemaETriples(localSchemaFile,
            TermUtils.RDFS_TBOX);

    // get all the triples we care about
    schemaTerms = new HashMap<>();

    for (String termStr : termsSet) {

        Long term = Long.parseLong(termStr);

        if (allSchemaTriples.containsKey(term)) {
            schemaTerms.put(term, allSchemaTriples.get(term));
        }
    }

    String decoratedTable = table;
    int emptyRetries = 0;
    int totalInferredTriples = 0;
    int maxRetries = Config.getIntegerProperty(Constants.REASON_RETRY_KEY, 6);
    this.ddLimit = Config.getIntegerProperty(Constants.REASON_DATA_DIRECT_DOWNLOAD_LIMIT, 1_200_000);
    String instanceId = cloud.getInstanceId();

    QueryGenerator<Long> generator = new QueryGenerator<Long>(schemaTerms, null);

    // timestamp loop
    do {

        Set<Long> productiveTerms = new HashSet<>();
        int interimInferredTriples = 0;

        // First of all run all the queries asynchronously and remember the jobId and filename for each term

        List<QueryResult> queryResults = new ArrayList<QueryResult>();
        generator.setDecoratedTable(decoratedTable);

        List<String> queries = generator.getQueries();
        log.debug("Generated Queries: " + queries);
        String queryResultFilePrefix = instanceId + '_' + System.currentTimeMillis() + "_QueryResults_";
        int fileCount = 0;
        for (String query : queries) {
            String jobId = cloud.startBigDataQuery(query, new BigDataTable(this.table));
            queryResults
                    .add(QueryResult.create().setFilename(queryResultFilePrefix + fileCount).setJobId(jobId));
            fileCount++;
        }

        // invoke all the queries in parallel
        //this.invokeAll(queryTasks);

        long start = System.currentTimeMillis();

        String inferredTriplesFile = Utils.TEMP_FOLDER + instanceId + '_' + start + Constants.DOT_INF;

        // save all the query results in files in parallel
        //this.invokeAll(saveTasks);

        for (QueryResult queryResult : queryResults) {
            try {
                // block and wait for each job to complete then save results to a file
                QueryStats stats = cloud.saveBigQueryResultsToFile(queryResult.getJobId(),
                        queryResult.getFilename(), this.bucket, null, this.ddLimit);
                queryResult.setStats(stats);

            } catch (IOException ioe) {

                log.error("failed to save query results to file, jobId: " + queryResult.getJobId(), ioe);
                throw ioe;
            }
        }

        try (PrintWriter writer = new PrintWriter(
                new GZIPOutputStream(new FileOutputStream(inferredTriplesFile), Constants.GZIP_BUF_SIZE))) {

            // now loop through the queries
            //for(Entry<Term, Set<Triple>> entry: schemaTerms.entrySet()) {
            for (QueryResult queryResult : queryResults) {

                //Term term = entry.getKey();
                QueryStats stats = queryResult.getStats();

                BigInteger rows = stats.getTotalRows();//term.getRows();

                this.totalBytes = this.totalBytes + stats.getTotalProcessedBytes();//term.getBytes();

                // only process if triples are found matching this term
                if (!BigInteger.ZERO.equals(rows)) {

                    stopwatch1.start();

                    int inferredTriplesCount = this.inferAndSaveTriplesToFile(queryResult, productiveTerms,
                            writer);

                    interimInferredTriples += inferredTriplesCount;

                    this.totalRows = this.totalRows.add(rows);

                    stopwatch1.stop();

                } else {
                    log.info("Skipping query as no data is found");
                }
            }
        }

        totalInferredTriples += interimInferredTriples;

        if (interimInferredTriples > 0) {

            // stream smaller numbers of inferred triples
            // try uploading from cloud storage
            int streamingThreshold = Config.getIntegerProperty("ecarf.io.reasoning.streaming.threshold",
                    100000);

            log.info("Inserting " + interimInferredTriples + ", inferred triples into Big Data table for "
                    + productiveTerms.size() + " productive terms. Filename: " + inferredTriplesFile);

            if (interimInferredTriples <= streamingThreshold) {
                // stream the data

                Set<Triple> inferredTriples = TripleUtils.loadCompressedCSVTriples(inferredTriplesFile, true);
                log.info("Total triples to stream into Big Data: " + inferredTriples.size());
                cloud.streamObjectsIntoBigData(inferredTriples,
                        TableUtils.getBigQueryEncodedTripleTable(table));

                log.info("All inferred triples are streamed into Big Data table");

            } else {

                // load the data through cloud storage
                // upload the file to cloud storage
                log.info("Uploading inferred triples file into cloud storage: " + inferredTriplesFile);
                StorageObject file = cloud.uploadFileToCloudStorage(inferredTriplesFile, bucket);
                log.info("File " + file + ", uploaded successfully. Now loading it into big data.");

                String jobId = cloud.loadCloudStorageFilesIntoBigData(Lists.newArrayList(file.getUri()),
                        TableUtils.getBigQueryEncodedTripleTable(table), false);
                log.info(
                        "All inferred triples are loaded into Big Data table through cloud storage, completed jobId: "
                                + jobId);

            }

            // reset empty retries
            emptyRetries = 0;

            stopwatch2.reset();

        } else {
            log.info("No new inferred triples");
            // increment empty retries
            emptyRetries++;

            if (!stopwatch2.isRunning()) {
                stopwatch2.start();
            }
        }

        log.info("Total inferred triples so far = " + totalInferredTriples + ", current retry count: "
                + emptyRetries);

        if (emptyRetries < maxRetries) {
            ApiUtils.block(Config.getIntegerProperty(Constants.REASON_SLEEP_KEY, 20));

            // FIXME move into the particular cloud implementation service
            long elapsed = System.currentTimeMillis() - start;
            decoratedTable = "[" + table + "@-" + elapsed + "-]";

            log.info("Using table decorator: " + decoratedTable + ". Empty retries count: " + emptyRetries);
        }

    } while (emptyRetries < maxRetries); // end timestamp loop

    //executor.shutdown();
    log.info("Finished reasoning, total inferred triples = " + totalInferredTriples);
    //log.info("Number of avoided duplicate terms = " + this.duplicates);
    log.info("Total rows retrieved from big data = " + this.totalRows);
    log.info("Total processed GBytes = " + ((double) this.totalBytes / FileUtils.ONE_GB));
    log.info("Total process reasoning time (serialization in inf file) = " + stopwatch1);
    log.info("Total time spent in empty inference cycles = " + stopwatch2);
}

From source file: com.sunchenbin.store.feilong.core.io.FileUtil.java

/**
 * Formats a file size into a human-readable string.
 * 
 * <p>
 * The size is rendered in GB, MB or KB as appropriate; anything smaller is shown in Bytes.
 * </p>
 * 
 * <p>
 * Commons IO 2.4's {@link org.apache.commons.io.FileUtils#byteCountToDisplaySize(long)} rounds down
 * (for example 1.99 GB is displayed as 1 GB), which is too coarse for display purposes.
 * </p>
 * 
 * @param fileSize
 *            the file size in bytes
 * @return the formatted size string
 * @see #getFileSize(File)
 * @see org.apache.commons.io.FileUtils#ONE_GB
 * @see org.apache.commons.io.FileUtils#ONE_MB
 * @see org.apache.commons.io.FileUtils#ONE_KB
 * 
 * @see org.apache.commons.io.FileUtils#byteCountToDisplaySize(long)
 */
public static String formatSize(long fileSize) {
    String danwei = ""; // unit suffix: GB, MB or KB
    long chushu = 1; // divisor for the chosen unit
    if (fileSize >= FileUtils.ONE_GB) {
        danwei = "GB";
        chushu = FileUtils.ONE_GB;
    } else if (fileSize >= FileUtils.ONE_MB) {
        danwei = "MB";
        chushu = FileUtils.ONE_MB;
    } else if (fileSize >= FileUtils.ONE_KB) {
        danwei = "KB";
        chushu = FileUtils.ONE_KB;
    } else {
        return fileSize + "Bytes";
    }
    String yushu = 100 * (fileSize % chushu) / chushu + ""; // fractional part in hundredths
    if ("0".equals(yushu)) {
        return fileSize / chushu + danwei;
    }
    return fileSize / chushu + "." + yushu + danwei;
}
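
A brief usage sketch for the method above, assuming the FileUtil class shown here and commons-io are on the classpath; the inputs are arbitrary illustration values and the expected outputs follow from the integer arithmetic in the method.

System.out.println(FileUtil.formatSize(512));                      // "512Bytes"
System.out.println(FileUtil.formatSize(1536));                     // "1.50KB"
System.out.println(FileUtil.formatSize(3 * FileUtils.ONE_GB / 2)); // "1.50GB"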

From source file: org.apache.eagle.jpm.analyzer.mr.suggestion.MapReduceTaskNumProcessor.java

private String analyzeReduceTaskNum(List<String> optSettings) {
    StringBuilder sb = new StringBuilder();

    long numReduces = context.getNumReduces();
    if (numReduces > 0) {
        long avgReduceTime = context.getAvgReduceTimeInSec();
        long avgShuffleTime = context.getAvgShuffleTimeInSec();
        long avgShuffleBytes = context.getJob().getReduceCounters()
                .getCounterValue(JobCounters.CounterName.REDUCE_SHUFFLE_BYTES) / numReduces;
        long avgReduceOutput = context.getJob().getReduceCounters()
                .getCounterValue(JobCounters.CounterName.HDFS_BYTES_WRITTEN) / numReduces;
        long avgReduceTotalTime = avgShuffleTime + avgReduceTime;

        long suggestReduces = 0;
        StringBuilder tmpsb = new StringBuilder();

        String avgShuffleDisplaySize = bytesToHumanReadable(avgShuffleBytes);
        if (avgShuffleBytes < 256 * FileUtils.ONE_MB && avgReduceTotalTime < 300
                && avgReduceOutput < 256 * FileUtils.ONE_MB && numReduces > 1) {
            tmpsb.append("average reduce input bytes is: ").append(avgShuffleDisplaySize).append(", ");
            suggestReduces = getReduceNum(avgShuffleBytes, avgReduceOutput, avgReduceTime);
        } else if (avgShuffleBytes > 10 * FileUtils.ONE_GB && avgReduceTotalTime > 1800) {
            tmpsb.append("average reduce input bytes is: ").append(avgShuffleDisplaySize).append(", ");
            suggestReduces = getReduceNum(avgShuffleBytes, avgReduceOutput, avgReduceTime);
        }

        if (avgReduceTotalTime < 60 && numReduces > 1) {
            tmpsb.append("average reduce time is only ").append(avgReduceTotalTime).append(" seconds, ");
            if (suggestReduces == 0) {
                suggestReduces = getReduceNum(avgShuffleBytes, avgReduceOutput, avgReduceTime);
            }
        } else if (avgReduceTotalTime > 3600 && avgReduceTime > 1800) {
            tmpsb.append("average reduce time is ").append(avgReduceTotalTime).append(" seconds, ");
            if (suggestReduces == 0) {
                suggestReduces = getReduceNum(avgShuffleBytes, avgReduceOutput, avgReduceTime);
            }
        }

        String avgReduceOutputDisplaySize = bytesToHumanReadable(avgReduceOutput);
        if (avgReduceOutput < 10 * FileUtils.ONE_MB && avgReduceTime < 300
                && avgShuffleBytes < 2 * FileUtils.ONE_GB && numReduces > 1) {
            tmpsb.append(" average reduce output is only ").append(avgReduceOutputDisplaySize).append(", ");
            if (suggestReduces == 0) {
                suggestReduces = getReduceNum(avgShuffleBytes, avgReduceOutput, avgReduceTime);
            }
        } else if (avgReduceOutput > 10 * FileUtils.ONE_GB && avgReduceTime > 1800) {
            tmpsb.append(" average reduce output is ").append(avgReduceOutputDisplaySize).append(", ");
            if (suggestReduces == 0) {
                suggestReduces = getReduceNum(avgShuffleBytes, avgReduceOutput, avgReduceTime);
            }
        }

        if (suggestReduces > 0) {
            sb.append("Best practice: ").append(tmpsb.toString()).append("please consider ");
            if (suggestReduces > numReduces) {
                sb.append("increasing the ");
            } else {
                sb.append("decreasing the ");
            }
            String setting = String.format("-D%s=%s", NUM_REDUCES, suggestReduces);
            sb.append("reducer number. You could try ").append(setting).append("\n");
            optSettings.add(setting);
        }
    }
    return sb.toString();
}

From source file: org.apache.eagle.jpm.analyzer.mr.suggestion.MapReduceTaskNumProcessor.java

private String analyzeMapTaskNum(List<String> optSettings) {
    StringBuilder sb = new StringBuilder();

    long numMaps = context.getNumMaps();
    long avgMapTime = context.getAvgMapTimeInSec();
    long avgMapInput = context.getJob().getMapCounters()
            .getCounterValue(JobCounters.CounterName.HDFS_BYTES_READ) / numMaps;
    String avgMapInputDisplaySize = bytesToHumanReadable(avgMapInput);

    if (avgMapInput < 5 * FileUtils.ONE_MB && avgMapTime < 30 && numMaps > 1) {
        sb.append("Best practice: average map input bytes only have ").append(avgMapInputDisplaySize);
        sb.append(". Please reduce the number of mappers by merging input files.\n");
    } else if (avgMapInput > FileUtils.ONE_GB) {
        sb.append("Best practice: average map input bytes have ").append(avgMapInputDisplaySize);
        sb.append(
                ". Please increase the number of mappers by using splittable compression, a container file format or a smaller block size.\n");
    }

    if (avgMapTime < 10 && numMaps > 1) {
        sb.append("Best practice: average map time only have ").append(avgMapTime);
        sb.append(
                " seconds. Please reduce the number of mappers by merging input files or by using a larger block size.\n");
    } else if (avgMapTime > 600 && avgMapInput < FileUtils.ONE_GB) {
        sb.append("Best practice: average map time is ").append(avgMapInput);
        sb.append(
                " seconds. Please increase the number of mappers by using splittable compression, a container file format or a smaller block size.\n");
    }

    return sb.toString();
}

From source file: org.apache.eagle.jpm.analyzer.mr.suggestion.MapReduceTaskNumProcessor.java

private long getReduceNum(long avgInputBytes, long avgOutputBytes, long avgTime) {
    long newReduceNum = 1;
    long tmpReduceNum;

    long numReduces = context.getNumReduces();
    tmpReduceNum = avgInputBytes * numReduces / (3 * FileUtils.ONE_GB);
    if (tmpReduceNum > newReduceNum) {
        newReduceNum = tmpReduceNum;
    }

    tmpReduceNum = avgOutputBytes * numReduces / (2 * FileUtils.ONE_GB);
    if (tmpReduceNum > newReduceNum) {
        newReduceNum = tmpReduceNum;
    }

    tmpReduceNum = avgTime * numReduces / (10 * 60);
    if (tmpReduceNum > newReduceNum) {
        newReduceNum = tmpReduceNum;
    }

    return newReduceNum;
}
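
To make the heuristic above concrete, here is a hypothetical walk-through with invented figures (not taken from the source):

// Sketch of the three candidate reducer counts computed by getReduceNum, with made-up inputs
long numReduces = 50;                              // current number of reducers
long avgInputBytes = FileUtils.ONE_GB;             // 1 GB average shuffle input per reducer
long avgOutputBytes = 512 * FileUtils.ONE_MB;      // 512 MB average output per reducer
long avgTime = 120;                                // 120 s average reduce time

long byInput  = avgInputBytes * numReduces / (3 * FileUtils.ONE_GB);  // = 16
long byOutput = avgOutputBytes * numReduces / (2 * FileUtils.ONE_GB); // = 12
long byTime   = avgTime * numReduces / (10 * 60);                     // = 10
long suggested = Math.max(1L, Math.max(byInput, Math.max(byOutput, byTime))); // = 16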

From source file: org.apache.usergrid.services.assets.data.S3BinaryStore.java

@Override
public void write(final UUID appId, final Entity entity, InputStream inputStream) throws IOException {

    String uploadFileName = AssetUtils.buildAssetKey(appId, entity);
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    long written = IOUtils.copyLarge(inputStream, baos, 0, FIVE_MB);
    byte[] data = baos.toByteArray();

    final Map<String, Object> fileMetadata = AssetUtils.getFileMetadata(entity);
    fileMetadata.put(AssetUtils.LAST_MODIFIED, System.currentTimeMillis());

    String mimeType = AssetMimeHandler.get().getMimeType(entity, data);

    if (written < FIVE_MB) { // total smaller than 5mb

        BlobStore blobStore = getContext().getBlobStore();
        BlobBuilder.PayloadBlobBuilder bb = blobStore.blobBuilder(uploadFileName).payload(data).calculateMD5()
                .contentType(mimeType);

        fileMetadata.put(AssetUtils.CONTENT_LENGTH, written);
        if (fileMetadata.get(AssetUtils.CONTENT_DISPOSITION) != null) {
            bb.contentDisposition(fileMetadata.get(AssetUtils.CONTENT_DISPOSITION).toString());
        }
        final Blob blob = bb.build();

        String md5sum = Hex.encodeHexString(blob.getMetadata().getContentMetadata().getContentMD5());
        fileMetadata.put(AssetUtils.CHECKSUM, md5sum);

        String eTag = blobStore.putBlob(bucketName, blob);
        fileMetadata.put(AssetUtils.E_TAG, eTag);
    } else { // bigger than 5mb... dump 5 mb tmp files and upload from them

        // todo: yes, AsyncBlobStore is deprecated, but there appears to be no replacement yet
        final AsyncBlobStore blobStore = getContext().getAsyncBlobStore();

        File tempFile = File.createTempFile(entity.getUuid().toString(), "tmp");
        tempFile.deleteOnExit();
        OutputStream os = null;
        try {
            os = new BufferedOutputStream(new FileOutputStream(tempFile.getAbsolutePath()));
            os.write(data);
            written += IOUtils.copyLarge(inputStream, os, 0, (FileUtils.ONE_GB * 5));
        } finally {
            IOUtils.closeQuietly(os);
        }

        BlobBuilder.PayloadBlobBuilder bb = blobStore.blobBuilder(uploadFileName).payload(tempFile)
                .calculateMD5().contentType(mimeType);

        fileMetadata.put(AssetUtils.CONTENT_LENGTH, written);
        if (fileMetadata.get(AssetUtils.CONTENT_DISPOSITION) != null) {
            bb.contentDisposition(fileMetadata.get(AssetUtils.CONTENT_DISPOSITION).toString());
        }
        final Blob blob = bb.build();

        final File finalTempFile = tempFile;
        final ListenableFuture<String> future = blobStore.putBlob(bucketName, blob,
                PutOptions.Builder.multipart());

        Runnable listener = new Runnable() {
            @Override
            public void run() {
                try {
                    String eTag = future.get();
                    fileMetadata.put(AssetUtils.E_TAG, eTag);
                    EntityManager em = emf.getEntityManager(appId);
                    em.update(entity);
                    finalTempFile.delete();
                } catch (Exception e) {
                    LOG.error("error uploading", e);
                }
                if (finalTempFile != null && finalTempFile.exists()) {
                    finalTempFile.delete();
                }
            }
        };
        future.addListener(listener, executor);
    }
}