io.ecarf.core.cloud.task.processor.ProcessLoadTask.java Source code

Introduction

Here is the source code for io.ecarf.core.cloud.task.processor.ProcessLoadTask.java; a short usage sketch follows the listing.

Source

/**
 * The contents of this file may be used under the terms of the Apache License, Version 2.0
 * in which case, the provisions of the Apache License Version 2.0 are applicable instead of those above.
 *
 * Copyright 2014, Ecarf.io
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.ecarf.core.cloud.task.processor;

import io.cloudex.framework.utils.FileUtils;
import io.ecarf.core.cloud.impl.google.EcarfGoogleCloudService;
import io.ecarf.core.cloud.task.processor.files.ProcessFilesTask;
import io.ecarf.core.term.TermCounter;
import io.ecarf.core.term.dictionary.TermDictionary;
import io.ecarf.core.term.dictionary.TermDictionaryCore;
import io.ecarf.core.utils.Constants;
import io.ecarf.core.utils.FilenameUtils;
import io.ecarf.core.utils.Utils;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.Callable;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import com.google.common.base.Stopwatch;

/**
 * EVM task to load the provided cloud files into the big data cloud storage.
 * It also analyses the terms as they are being processed: it reads the files
 * from http:// or gs:// locations, downloads them locally (gzipped), then
 * reads through them counting the relevant terms and rewriting them into
 * BigQuery format (comma separated).
 * Makes use of multiple CPU cores for parallel processing.
 * 
 * @author Omer Dawelbeit (omerio)
 *
 */
public class ProcessLoadTask extends ProcessFilesTask<TermCounter> {

    private static final Log log = LogFactory.getLog(ProcessLoadTask.class);

    private String bucket;

    private String sourceBucket;

    private String schemaTermsFile;

    private String countOnly;

    private String encode;

    private Set<String> schemaTerms;

    private Map<String, Integer> count = new HashMap<>();

    private TermDictionary dictionary;

    private String dictionaryFile;

    /*
     * TODO distinguish between files in cloud storage and files downloaded from an http or https URL
     * (non-Javadoc)
     * @see io.ecarf.core.cloud.task.Task#run()
     */
    @Override
    public void run() throws IOException {

        log.info("START: processing files, memory usage: " + Utils.getMemoryUsageInGB() + "GB" + ", timer: 0s");
        Stopwatch stopwatch = Stopwatch.createStarted();

        EcarfGoogleCloudService cloudService = (EcarfGoogleCloudService) this.getCloudService();

        log.info("Downloading schema terms file: " + schemaTermsFile);

        this.schemaTerms = cloudService.getSetFromCloudStorageFile(schemaTermsFile, bucket);

        String localDictionaryFile = null;

        if (Boolean.valueOf(encode) && StringUtils.isNotBlank(this.dictionaryFile)) {

            log.info("Downloading and loading dictionary into memory from file: " + this.dictionaryFile);

            // 1- Download the dictionary
            localDictionaryFile = Utils.TEMP_FOLDER + dictionaryFile;

            if (FilenameUtils.fileExists(localDictionaryFile)) {
                log.info("Re-using local file: " + localDictionaryFile);

            } else {
                this.cloudService.downloadObjectFromCloudStorage(dictionaryFile, localDictionaryFile, this.bucket);
            }

            log.info("Loading the dictionary from file: " + localDictionaryFile + ", memory usage: "
                    + Utils.getMemoryUsageInGB() + "GB, timer: " + stopwatch);

            // 2- Load the dictionary
            try {

                dictionary = Utils.objectFromFile(localDictionaryFile, TermDictionaryCore.class, true, false);

            } catch (ClassNotFoundException e) {

                log.error("Failed to load the dictionary", e);
                throw new IOException(e);
            }

            log.info("Dictionary loaded successfully, memory usage: " + Utils.getMemoryUsageInGB() + "GB, timer: "
                    + stopwatch);
        }

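        // run the sub-tasks created by getSubTasks() below; the parent ProcessFilesTask is
        // assumed to execute them in parallel and to pass their TermCounter results back
        // through processSingleOutput()/processMultiOutput()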
        super.run();

        // write term stats to file and upload
        if (!count.isEmpty()) {

            log.info("Saving terms stats");
            String countStatsFile = Utils.TEMP_FOLDER + cloudService.getInstanceId() + Constants.DOT_JSON;
            FileUtils.objectToJsonFile(countStatsFile, count);

            cloudService.uploadFileToCloudStorage(countStatsFile, bucket);
        }

        // delete the local dictionary file to free up disk space
        if (localDictionaryFile != null) {
            FileUtils.deleteFile(localDictionaryFile);
            // release the reference so the dictionary can be garbage collected
            this.dictionary = null;
        }

        log.info("FINISH: All files are processed and uploaded successfully,, memory usage: "
                + Utils.getMemoryUsageInGB() + "GB, timer: " + stopwatch);
    }

    /**
     * Returns the processing sub-tasks, one per input file.
     */
    @Override
    public List<Callable<TermCounter>> getSubTasks(Set<String> files) {
        List<Callable<TermCounter>> tasks = new ArrayList<>();

        if (StringUtils.isBlank(sourceBucket)) {
            log.warn("sourceBucket is empty, using bucket: " + this.bucket);
            this.sourceBucket = this.bucket;
        }

        for (final String file : files) {

            TermCounter counter = null;

            if (schemaTerms != null) {
                counter = new TermCounter();
                counter.setTermsToCount(schemaTerms);
            }

            ProcessFilesForBigQuerySubTask task = new ProcessFilesForBigQuerySubTask(file, bucket, sourceBucket,
                    counter, this.dictionary, Boolean.valueOf(countOnly), Boolean.valueOf(encode),
                    this.getCloudService());
            tasks.add(task);

        }

        return tasks;
    }

    /*
     * (non-Javadoc)
     * @see io.ecarf.core.cloud.task.processor.files.ProcessFilesTask#processSingleOutput(java.lang.Object)
     */
    @Override
    public void processSingleOutput(TermCounter counter) {
        if (counter != null) {
            count = counter.getCount();
        }

    }

    /*
     * (non-Javadoc)
     * @see io.ecarf.core.cloud.task.processor.files.ProcessFilesTask#processMultiOutput(java.util.List)
     */
    @Override
    public void processMultiOutput(List<TermCounter> counters) {
        for (TermCounter counter : counters) {
            if (counter != null) {
                Utils.mergeCountMaps(count, counter.getCount());
            }
        }
    }

    /**
     * @return the bucket
     */
    public String getBucket() {
        return bucket;
    }

    /**
     * @param bucket the bucket to set
     */
    public void setBucket(String bucket) {
        this.bucket = bucket;
    }

    /**
     * @return the schemaTermsFile
     */
    public String getSchemaTermsFile() {
        return schemaTermsFile;
    }

    /**
     * @param schemaTermsFile the schemaTermsFile to set
     */
    public void setSchemaTermsFile(String schemaTermsFile) {
        this.schemaTermsFile = schemaTermsFile;
    }

    /**
     * @return the schemaTerms
     */
    public Set<String> getSchemaTerms() {
        return schemaTerms;
    }

    /**
     * @return the count
     */
    public Map<String, Integer> getCount() {
        return count;
    }

    /**
     * @return the countOnly
     */
    public String getCountOnly() {
        return countOnly;
    }

    /**
     * @param countOnly the countOnly to set
     */
    public void setCountOnly(String countOnly) {
        this.countOnly = countOnly;
    }

    /**
     * @return the sourceBucket
     */
    public String getSourceBucket() {
        return sourceBucket;
    }

    /**
     * @param sourceBucket the sourceBucket to set
     */
    public void setSourceBucket(String sourceBucket) {
        this.sourceBucket = sourceBucket;
    }

    /**
     * @return the encode
     */
    public String getEncode() {
        return encode;
    }

    /**
     * @param encode the encode to set
     */
    public void setEncode(String encode) {
        this.encode = encode;
    }

    /**
     * @return the dictionaryFile
     */
    public String getDictionaryFile() {
        return dictionaryFile;
    }

    /**
     * @param dictionaryFile the dictionaryFile to set
     */
    public void setDictionaryFile(String dictionaryFile) {
        this.dictionaryFile = dictionaryFile;
    }

}
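
Usage

For illustration, here is a minimal usage sketch, assuming the surrounding CloudEx/ECARF framework has already injected the task's cloud service and the set of input files to process (that wiring lives outside this class and is not shown). All bucket and file names below are hypothetical placeholders, not values from the original project. Note that countOnly and encode are held as strings and parsed with Boolean.valueOf inside the task, which is why they are set as string values here.

import io.ecarf.core.cloud.task.processor.ProcessLoadTask;

import java.io.IOException;
import java.util.Map;

public class ProcessLoadTaskExample {

    public static void main(String[] args) throws IOException {

        ProcessLoadTask task = new ProcessLoadTask();

        // hypothetical configuration; a real deployment would supply these via the framework
        task.setBucket("my-ecarf-bucket");             // bucket holding the schema terms file, dictionary and stats
        task.setSourceBucket("my-source-bucket");      // bucket containing the files to process
        task.setSchemaTermsFile("schema_terms.json");  // schema terms to count
        task.setCountOnly("false");                    // "true" to only count terms
        task.setEncode("false");                       // "true" to encode terms using the dictionary
        // task.setDictionaryFile("term_dictionary");  // only required when encode is "true"

        task.run();                                    // download, count and convert the files

        // merged term counts gathered from the sub-tasks
        Map<String, Integer> counts = task.getCount();
        System.out.println("Counted " + counts.size() + " schema terms");
    }
}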