org.rdfhdt.mrbuilder.HDTBuilderDriver.java Source code

Introduction

Here is the source code for org.rdfhdt.mrbuilder.HDTBuilderDriver.java
Source

/**
 * Author: Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 *
 * Contacting the authors:
 *   Jose M. Gimenez-Garcia:    josemiguel.gimenez@alumnos.uva.es
 *   Javier D. Fernandez:       jfergar@infor.uva.es, javier.fernandez@wu.ac.at
 *   Miguel A. Martinez-Prieto: migumar2@infor.uva.es
 */
package org.rdfhdt.mrbuilder;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Arrays;

import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;
import org.rdfhdt.hdt.dictionary.impl.FourSectionDictionary;
import org.rdfhdt.hdt.dictionary.impl.FourSectionDictionary2;
import org.rdfhdt.hdt.dictionary.impl.section.TransientDictionarySection;
import org.rdfhdt.hdt.hdt.impl.TransientHDT;
import org.rdfhdt.hdt.options.ControlInformation;
import org.rdfhdt.hdt.trans.TransientElement;
import org.rdfhdt.hdt.triples.impl.TransientBitMapTriples;
import org.rdfhdt.listener.HDTBuilderListener;
import org.rdfhdt.mrbuilder.dictionary.DictionaryCombiner;
import org.rdfhdt.mrbuilder.dictionary.DictionaryMapper;
import org.rdfhdt.mrbuilder.dictionary.DictionaryReducer;
import org.rdfhdt.mrbuilder.dictionary.DictionarySamplerMapper;
import org.rdfhdt.mrbuilder.dictionary.DictionarySamplerReducer;
import org.rdfhdt.mrbuilder.io.TripleSPOComparator;
import org.rdfhdt.mrbuilder.io.TripleSPOWritable;
import org.rdfhdt.mrbuilder.triples.TriplesSPOMapper;
import org.rdfhdt.mrbuilder.util.FileStatusComparator;

import com.hadoop.mapreduce.LzoTextInputFormat;

public class HDTBuilderDriver {

    /**
     * Names of the Hadoop job counters used by the builder.
     * <p>
     * This driver reads {@code Shared}, {@code Subjects}, {@code Predicates} and {@code Objects} after the
     * dictionary jobs and {@code Triples} after the triples jobs. {@code Sample} is not read in this class
     * (presumably it is maintained by the sampling tasks).
     * <p>
     * The declaration order is kept stable on purpose.
     */
    public enum Counters {
        Triples,
        Subjects,
        Predicates,
        Objects,
        Shared,
        Sample
    }

    /** Builder configuration created from the command-line arguments. */
    protected HDTBuilderConfiguration conf;
    /** Listener handed to the load/save calls of the dictionary, sections, triples and HDT. */
    protected HDTBuilderListener listener;
    /** File systems resolved in the constructor from the input, dictionary output and triples output paths. */
    protected FileSystem inputFS, dictionaryFS, triplesFS;
    /** Element counts; {@code null} until taken from job counters or read back from a counters file. */
    protected Long numTriples = null, numShared = null, numSubjects = null, numPredicates = null, numObjects = null;
    /** Dictionary held in memory; {@code null} until built by buildDictionary() or loaded by buidHDT(). */
    protected FourSectionDictionary2 dictionary = null;

    /**
     * Creates a driver: parses the configuration, creates the listener and resolves one
     * {@link FileSystem} for each of the input, dictionary output and triples output paths.
     *
     * @param args command-line arguments, passed unchanged to {@link HDTBuilderConfiguration}
     * @throws IOException if a file system cannot be obtained for one of the configured paths
     */
    public HDTBuilderDriver(String[] args) throws IOException {

        // Configuration first: everything else is derived from it.
        conf = new HDTBuilderConfiguration(args);
        listener = new HDTBuilderListener(conf);

        // One FileSystem per path, since each path resolves its own file system.
        inputFS = conf.getInputPath().getFileSystem(conf.getConfigurationObject());
        dictionaryFS = conf.getDictionaryOutputPath().getFileSystem(conf.getConfigurationObject());
        triplesFS = conf.getTriplesOutputPath().getFileSystem(conf.getConfigurationObject());
    }

    /**
     * Command-line entry point. Runs, in order, every stage enabled in the configuration: dictionary
     * sampling, dictionary job, dictionary build, triples sampling, triples job and HDT build. A stage
     * is only attempted while all previous stages have succeeded.
     * <p>
     * The process always terminates through {@link System#exit(int)}: status 0 when every executed
     * stage succeeded, 1 otherwise.
     *
     * @param args command-line arguments used to build the driver
     * @throws Exception any error raised by the driver or by one of the stages
     */
    public static void main(String[] args) throws Exception {
        boolean ok = true;
        final HDTBuilderDriver driver = new HDTBuilderDriver(args);
        final HDTBuilderConfiguration settings = driver.conf;

        // --- Dictionary: sampling phase ---
        if (settings.runDictionarySampling()) {
            if (settings.getDictionaryReducers() == 1) {
                System.out.println(
                        "WARNING: Only one Reducer. Dictionary creation as a single job is more efficient.");
            }
            ok = driver.runDictionaryJobSampling();
        }

        // --- Dictionary: two-phase job with several reducers, single job otherwise ---
        if (ok && settings.runDictionary()) {
            ok = settings.getDictionaryReducers() > 1
                    ? driver.runDictionaryJob()
                    : driver.runDictionaryJobWithOneJob();
        }

        // --- Dictionary: assemble and save ---
        if (ok && settings.buildDictionary()) {
            ok = driver.buildDictionary();
        }

        // --- Triples: sampling phase ---
        if (ok && settings.runTriplesSampling()) {
            if (settings.getTriplesReducers() == 1) {
                System.out
                        .println("WARNING: Only one Reducer. Triples creation as a single job is more efficient.");
            }
            ok = driver.runTriplesJobSampling();
        }

        // --- Triples: two-phase job with several reducers, single job otherwise ---
        if (ok && settings.runTriples()) {
            ok = settings.getTriplesReducers() > 1
                    ? driver.runTriplesJob()
                    : driver.runTriplesJobWithOneJob();
        }

        // --- Final HDT file ---
        if (ok && settings.buildHDT()) {
            ok = driver.buidHDT();
        }

        final int status = ok ? 0 : 1;
        System.exit(status);
    }

    /**
     * Runs phase 1 of the two-phase dictionary build: reads the LZO text input and writes
     * block-compressed (LZO) SequenceFiles of {@code Text}/{@code Text} pairs to the dictionary
     * samples path, which phase 2 then consumes.
     * <p>
     * Terminates the JVM with status -1 when the input path is missing, or when the samples path
     * already exists and deleting it was not requested.
     *
     * @return {@code true} if the Hadoop job completed successfully
     * @throws IOException            on file system or job submission errors
     * @throws ClassNotFoundException if a job class cannot be found
     * @throws InterruptedException   if waiting for the job is interrupted
     */
    protected boolean runDictionaryJobSampling() throws IOException, ClassNotFoundException, InterruptedException {
        // Guard: the input must exist.
        if (!inputFS.exists(conf.getInputPath())) {
            System.out.println("Dictionary input path does not exist: " + conf.getInputPath());
            System.exit(-1);
        }

        // Guard: an existing samples path is only removed when explicitly requested.
        if (dictionaryFS.exists(conf.getDictionarySamplesPath())) {
            if (!conf.getDeleteDictionarySamplesPath()) {
                System.out.println("Dictionary samples path does exist: " + conf.getDictionarySamplesPath());
                System.out.println("Select other path or use option -ds to overwrite");
                System.exit(-1);
            } else {
                dictionaryFS.delete(conf.getDictionarySamplesPath(), true);
            }
        }

        // Job that produces the SequenceFile (with roles) consumed by phase 2.
        final Job samplingJob = new Job(conf.getConfigurationObject(), conf.getDictionaryJobName() + " phase 1");
        samplingJob.setJarByClass(HDTBuilderDriver.class);

        System.out.println("input = " + conf.getInputPath());
        System.out.println("samples = " + conf.getDictionarySamplesPath());

        // Input / output locations and formats.
        FileInputFormat.addInputPath(samplingJob, conf.getInputPath());
        FileOutputFormat.setOutputPath(samplingJob, conf.getDictionarySamplesPath());
        samplingJob.setInputFormatClass(LzoTextInputFormat.class);
        LazyOutputFormat.setOutputFormatClass(samplingJob, SequenceFileOutputFormat.class);

        // Map side; the reducer class doubles as combiner.
        samplingJob.setMapperClass(DictionarySamplerMapper.class);
        samplingJob.setMapOutputKeyClass(Text.class);
        samplingJob.setMapOutputValueClass(Text.class);
        samplingJob.setCombinerClass(DictionarySamplerReducer.class);

        // Reduce side.
        samplingJob.setReducerClass(DictionarySamplerReducer.class);
        samplingJob.setOutputKeyClass(Text.class);
        samplingJob.setOutputValueClass(Text.class);
        samplingJob.setNumReduceTasks(conf.getDictionarySampleReducers());

        // Block-level LZO compression of the output SequenceFiles.
        SequenceFileOutputFormat.setCompressOutput(samplingJob, true);
        SequenceFileOutputFormat.setOutputCompressorClass(samplingJob, com.hadoop.compression.lzo.LzoCodec.class);
        SequenceFileOutputFormat.setOutputCompressionType(samplingJob, SequenceFile.CompressionType.BLOCK);

        return samplingJob.waitForCompletion(true);
    }

    /**
     * Runs phase 2 of the two-phase dictionary build. The job reads the SequenceFiles in the dictionary
     * samples path, partitions them with {@link TotalOrderPartitioner} (the partition file is written
     * by {@link InputSampler} using an interval sampler) and writes the shared, subjects, predicates
     * and objects named outputs under the dictionary output path, block-compressed with LZO.
     * <p>
     * After the job finishes, the four section counters are stored in the corresponding fields and
     * written as {@code name=value} lines to the dictionary counters file.
     * <p>
     * Terminates the JVM with status -1 when the output path already exists and deleting it was not
     * requested.
     *
     * @return {@code true} if the Hadoop job completed successfully
     * @throws ClassNotFoundException if a job class cannot be found
     * @throws IOException            on file system or job submission errors
     * @throws InterruptedException   if waiting for the job is interrupted
     * @throws URISyntaxException     if the partition file location is not a valid URI
     */
    protected boolean runDictionaryJob()
            throws ClassNotFoundException, IOException, InterruptedException, URISyntaxException {
        boolean jobOK;
        Job job = null;

        // if output path exists...
        if (this.dictionaryFS.exists(this.conf.getDictionaryOutputPath())) {
            if (this.conf.getDeleteDictionaryOutputPath()) { // ... and option provided, delete recursively
                this.dictionaryFS.delete(this.conf.getDictionaryOutputPath(), true);
            } else { // ... and option not provided, fail
                System.out.println("Dictionary output path does exist: " + this.conf.getDictionaryOutputPath());
                System.out.println("Select other path or use option -dd to overwrite");
                System.exit(-1);
            }
        }

        // Sample the SequenceInputFormat to do TotalSort and create final output
        job = new Job(this.conf.getConfigurationObject(), this.conf.getDictionaryJobName() + " phase 2");

        job.setJarByClass(HDTBuilderDriver.class);

        System.out.println("samples = " + this.conf.getDictionarySamplesPath());
        System.out.println("output = " + this.conf.getDictionaryOutputPath());

        FileInputFormat.addInputPath(job, this.conf.getDictionarySamplesPath());
        FileOutputFormat.setOutputPath(job, this.conf.getDictionaryOutputPath());

        job.setInputFormatClass(SequenceFileInputFormat.class);
        LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

        // No mapper class is set, so Hadoop's default (identity) mapper is used
        job.setCombinerClass(DictionaryCombiner.class);
        job.setPartitionerClass(TotalOrderPartitioner.class);
        job.setReducerClass(DictionaryReducer.class);

        job.setNumReduceTasks(this.conf.getDictionaryReducers());

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // Write the partition file and ship it to the tasks through the distributed cache
        System.out.println("Sampling started");
        InputSampler.writePartitionFile(job,
                new InputSampler.IntervalSampler<Text, Text>(this.conf.getSampleProbability()));
        String partitionFile = TotalOrderPartitioner.getPartitionFile(job.getConfiguration());
        URI partitionUri = new URI(partitionFile + "#" + TotalOrderPartitioner.DEFAULT_PATH);
        DistributedCache.addCacheFile(partitionUri, job.getConfiguration());
        DistributedCache.createSymlink(job.getConfiguration());
        System.out.println("Sampling finished");

        MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.SHARED, SequenceFileOutputFormat.class,
                Text.class, NullWritable.class);
        MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.SUBJECTS, SequenceFileOutputFormat.class,
                Text.class, NullWritable.class);
        MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.PREDICATES, SequenceFileOutputFormat.class,
                Text.class, NullWritable.class);
        MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.OBJECTS, SequenceFileOutputFormat.class,
                Text.class, NullWritable.class);

        SequenceFileOutputFormat.setCompressOutput(job, true);
        SequenceFileOutputFormat.setOutputCompressorClass(job, com.hadoop.compression.lzo.LzoCodec.class);
        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

        jobOK = job.waitForCompletion(true);

        this.numShared = job.getCounters().findCounter(Counters.Shared).getValue();
        this.numSubjects = job.getCounters().findCounter(Counters.Subjects).getValue();
        this.numPredicates = job.getCounters().findCounter(Counters.Predicates).getValue();
        this.numObjects = job.getCounters().findCounter(Counters.Objects).getValue();

        // try-with-resources: the counters file is closed even when one of the writes fails
        try (BufferedWriter bufferedWriter = new BufferedWriter(
                new OutputStreamWriter(this.dictionaryFS.create(this.conf.getDictionaryCountersFile())))) {
            bufferedWriter.write(HDTBuilderConfiguration.SHARED + "=" + this.numShared + "\n");
            bufferedWriter.write(HDTBuilderConfiguration.SUBJECTS + "=" + this.numSubjects + "\n");
            bufferedWriter.write(HDTBuilderConfiguration.PREDICATES + "=" + this.numPredicates + "\n");
            bufferedWriter.write(HDTBuilderConfiguration.OBJECTS + "=" + this.numObjects + "\n");
        }

        return jobOK;
    }

    /**
     * Builds the dictionary sections with a single MapReduce job (no sampling phase). The job reads
     * the LZO text input and writes the shared, subjects, predicates and objects named outputs under
     * the dictionary output path.
     * <p>
     * After the job finishes, the four section counters are stored in the corresponding fields and
     * written as {@code name=value} lines to the dictionary counters file.
     * <p>
     * Terminates the JVM with status -1 when the input path is missing, or when the output path
     * already exists and deleting it was not requested.
     *
     * @return {@code true} if the Hadoop job completed successfully
     * @throws ClassNotFoundException if a job class cannot be found
     * @throws IOException            on file system or job submission errors
     * @throws InterruptedException   if waiting for the job is interrupted
     * @throws URISyntaxException     declared for symmetry with the two-phase variant
     */
    protected boolean runDictionaryJobWithOneJob()
            throws ClassNotFoundException, IOException, InterruptedException, URISyntaxException {
        boolean jobOK;
        Job job = null;

        // if input path does not exists, fail
        if (!this.inputFS.exists(this.conf.getInputPath())) {
            System.out.println("Dictionary input path does not exist: " + this.conf.getInputPath());
            System.exit(-1);
        }

        // if output path exists...
        if (this.dictionaryFS.exists(this.conf.getDictionaryOutputPath())) {
            if (this.conf.getDeleteDictionaryOutputPath()) { // ... and option provided, delete recursively
                this.dictionaryFS.delete(this.conf.getDictionaryOutputPath(), true);
            } else { // ... and option not provided, fail
                System.out.println("Dictionary output path does exist: " + this.conf.getDictionaryOutputPath());
                System.out.println("Select other path or use option -dd to overwrite");
                System.exit(-1);
            }
        }

        // Launch job. This is the dictionary job, so it must carry the dictionary job name
        // (it was previously labelled with the triples job name).
        job = new Job(this.conf.getConfigurationObject(), this.conf.getDictionaryJobName());
        job.setJarByClass(HDTBuilderDriver.class);

        FileInputFormat.addInputPath(job, this.conf.getInputPath());
        FileOutputFormat.setOutputPath(job, this.conf.getDictionaryOutputPath());

        job.setInputFormatClass(LzoTextInputFormat.class);
        LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

        job.setMapperClass(DictionaryMapper.class);
        job.setCombinerClass(DictionaryCombiner.class);
        job.setReducerClass(DictionaryReducer.class);

        job.setNumReduceTasks(this.conf.getDictionaryReducers());

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.SHARED, SequenceFileOutputFormat.class,
                Text.class, NullWritable.class);
        MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.SUBJECTS, SequenceFileOutputFormat.class,
                Text.class, NullWritable.class);
        MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.PREDICATES, SequenceFileOutputFormat.class,
                Text.class, NullWritable.class);
        MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.OBJECTS, SequenceFileOutputFormat.class,
                Text.class, NullWritable.class);

        jobOK = job.waitForCompletion(true);

        this.numShared = job.getCounters().findCounter(Counters.Shared).getValue();
        this.numSubjects = job.getCounters().findCounter(Counters.Subjects).getValue();
        this.numPredicates = job.getCounters().findCounter(Counters.Predicates).getValue();
        this.numObjects = job.getCounters().findCounter(Counters.Objects).getValue();

        // try-with-resources: the counters file is closed even when one of the writes fails
        try (BufferedWriter bufferedWriter = new BufferedWriter(
                new OutputStreamWriter(this.dictionaryFS.create(this.conf.getDictionaryCountersFile())))) {
            bufferedWriter.write(HDTBuilderConfiguration.SHARED + "=" + this.numShared + "\n");
            bufferedWriter.write(HDTBuilderConfiguration.SUBJECTS + "=" + this.numSubjects + "\n");
            bufferedWriter.write(HDTBuilderConfiguration.PREDICATES + "=" + this.numPredicates + "\n");
            bufferedWriter.write(HDTBuilderConfiguration.OBJECTS + "=" + this.numObjects + "\n");
        }

        return jobOK;
    }

    /**
     * Assembles the four dictionary sections from the output of the dictionary job and saves the
     * resulting dictionary to the configured dictionary file.
     * <p>
     * When the configuration says the dictionary job is not run in this execution, the section sizes
     * are first read back from the dictionary counters file ({@code name=value} lines). The shared
     * section is only loaded if its path exists; the other three sections are always loaded.
     *
     * @return always {@code true}; failures are reported through exceptions
     * @throws IOException if the counters file or a section cannot be read, or the dictionary cannot be saved
     */
    protected boolean buildDictionary() throws IOException {

        // if job not ran, read Counters
        if (!this.conf.runDictionary()) {

            System.out.println("Dictionary job not ran. Reading data from file.");

            // try-with-resources: the counters file is closed even if a line fails to parse
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(this.dictionaryFS.open(this.conf.getDictionaryCountersFile())))) {
                String line = reader.readLine();
                while (line != null) {
                    String[] data = line.split("=");
                    switch (data[0]) {
                    case HDTBuilderConfiguration.SHARED:
                        this.numShared = Long.parseLong(data[1]);
                        break;
                    case HDTBuilderConfiguration.SUBJECTS:
                        this.numSubjects = Long.parseLong(data[1]);
                        break;
                    case HDTBuilderConfiguration.PREDICATES:
                        this.numPredicates = Long.parseLong(data[1]);
                        break;
                    case HDTBuilderConfiguration.OBJECTS:
                        this.numObjects = Long.parseLong(data[1]);
                        break;
                    default:
                        // unknown keys are ignored
                        break;
                    }
                    line = reader.readLine();
                }
            }
        }

        TransientDictionarySection shared = new TransientDictionarySection(this.conf.getSpec());
        TransientDictionarySection subjects = new TransientDictionarySection(this.conf.getSpec());
        TransientDictionarySection predicates = new TransientDictionarySection(this.conf.getSpec());
        TransientDictionarySection objects = new TransientDictionarySection(this.conf.getSpec());

        // The shared section is skipped when its path is absent
        if (this.dictionaryFS.exists(this.conf.getSharedSectionPath())) {
            System.out.println("Shared section = " + this.conf.getSharedSectionPath());
            this.loadFromDir(shared, this.numShared, this.dictionaryFS, this.conf.getSharedSectionPath());
        }

        this.loadFromDir(subjects, this.numSubjects, this.dictionaryFS, this.conf.getSubjectsSectionPath());
        this.loadFromDir(predicates, this.numPredicates, this.dictionaryFS, this.conf.getPredicatesSectionPath());
        this.loadFromDir(objects, this.numObjects, this.dictionaryFS, this.conf.getObjectsSectionPath());

        System.out.println("Saving dictionary...");
        this.dictionary = new FourSectionDictionary2(this.conf.getSpec(), subjects, predicates, objects, shared);
        this.saveDictionary(this.dictionary, this.dictionaryFS, this.conf.getDictionaryFile());

        return true;

    }

    /**
     * Runs phase 1 of the two-phase triples build: reads the LZO text input, maps it with
     * {@link TriplesSPOMapper} (the dictionary file is shipped through the distributed cache) and
     * writes block-compressed (LZO) SequenceFiles keyed by {@link TripleSPOWritable} to the triples
     * samples path. The {@code Triples} counter is then stored in {@code numTriples} and written to the
     * triples counters file.
     * <p>
     * Terminates the JVM with status -1 when the input path or the dictionary file is missing, or when
     * the samples path already exists and deleting it was not requested.
     *
     * @return {@code true} if the Hadoop job completed successfully
     * @throws ClassNotFoundException if a job class cannot be found
     * @throws IOException            on file system or job submission errors
     * @throws InterruptedException   if waiting for the job is interrupted
     */
    protected boolean runTriplesJobSampling() throws ClassNotFoundException, IOException, InterruptedException {
        Job job = null;
        boolean jobOK;

        // if input path does not exists, fail
        if (!this.inputFS.exists(this.conf.getInputPath())) {
            System.out.println("Dictionary input path does not exist: " + this.conf.getInputPath());
            System.exit(-1);
        }

        // if the dictionary file shipped to the tasks does not exist, fail.
        // (The previous check re-tested the input path, so it could never detect a missing dictionary.)
        if (!this.dictionaryFS.exists(this.conf.getDictionaryFile())) {
            System.out.println("Dictionary file does not exist: " + this.conf.getDictionaryFile());
            System.exit(-1);
        }

        // Resolve the samples path against its own file system rather than the dictionary one,
        // so the check and the delete work even if the two live on different file systems.
        FileSystem samplesFS = this.conf.getTriplesSamplesPath().getFileSystem(this.conf.getConfigurationObject());

        // if samples path exists...
        if (samplesFS.exists(this.conf.getTriplesSamplesPath())) {
            if (this.conf.getDeleteTriplesSamplesPath()) { // ... and option provided, delete recursively
                samplesFS.delete(this.conf.getTriplesSamplesPath(), true);
            } else { // ... and option not provided, fail
                System.out.println("Triples samples path does exist: " + this.conf.getTriplesSamplesPath());
                System.out.println("Select other path or use option -dst to overwrite");
                System.exit(-1);
            }
        }

        // NOTE(review): hard-coded child JVM options (heap size and an absolute error-file path)
        // override whatever the user configured; consider making them configurable.
        this.conf.setProperty("mapred.child.java.opts",
                "-XX:ErrorFile=/home/hadoop/tmp/hs_err_pid%p.log -Xmx2500m");

        // Job to create a SequenceInputFormat
        job = new Job(this.conf.getConfigurationObject(), this.conf.getTriplesJobName() + " phase 1");

        job.setJarByClass(HDTBuilderDriver.class);

        FileInputFormat.addInputPath(job, this.conf.getInputPath());
        FileOutputFormat.setOutputPath(job, this.conf.getTriplesSamplesPath());

        job.setInputFormatClass(LzoTextInputFormat.class);
        LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

        job.setMapperClass(TriplesSPOMapper.class);
        job.setSortComparatorClass(TripleSPOComparator.class);
        job.setGroupingComparatorClass(TripleSPOComparator.class);
        job.setMapOutputKeyClass(TripleSPOWritable.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(TripleSPOWritable.class);
        job.setOutputValueClass(NullWritable.class);

        job.setNumReduceTasks(this.conf.getTriplesReducers());

        DistributedCache.addCacheFile(this.conf.getDictionaryFile().toUri(), job.getConfiguration());

        SequenceFileOutputFormat.setCompressOutput(job, true);
        SequenceFileOutputFormat.setOutputCompressorClass(job, com.hadoop.compression.lzo.LzoCodec.class);
        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

        jobOK = job.waitForCompletion(true);

        this.numTriples = job.getCounters().findCounter(Counters.Triples).getValue();
        // try-with-resources: the counters file is closed even when the write fails
        try (BufferedWriter bufferedWriter = new BufferedWriter(
                new OutputStreamWriter(this.triplesFS.create(this.conf.getTriplesCountersFile())))) {
            bufferedWriter.write(this.numTriples.toString() + "\n");
        }

        return jobOK;
    }

    /**
     * Runs phase 2 of the two-phase triples build: reads the SequenceFiles in the triples samples
     * path, partitions them with {@link TotalOrderPartitioner} (the partition file is written by
     * {@link InputSampler} using an interval sampler) and writes block-compressed (LZO) SequenceFiles
     * keyed by {@link TripleSPOWritable} to the triples output path.
     * <p>
     * This method does not set {@code numTriples}; that counter is recorded by the sampling phase.
     * <p>
     * Terminates the JVM with status -1 when the output path already exists and deleting it was not
     * requested.
     *
     * @return {@code true} if the Hadoop job completed successfully
     * @throws IOException            on file system or job submission errors
     * @throws ClassNotFoundException if a job class cannot be found
     * @throws InterruptedException   if waiting for the job is interrupted
     * @throws URISyntaxException     if the partition file location is not a valid URI
     */
    protected boolean runTriplesJob()
            throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
        Job job = null;
        boolean jobOK;

        // if triples output path exists...
        if (this.triplesFS.exists(this.conf.getTriplesOutputPath())) {
            if (this.conf.getDeleteTriplesOutputPath()) { // ... and option provided, delete recursively
                this.triplesFS.delete(this.conf.getTriplesOutputPath(), true);
            } else { // ... and option not provided, fail
                System.out.println("Triples output path does exist: " + this.conf.getTriplesOutputPath());
                System.out.println("Select other path or use option -dt to overwrite");
                System.exit(-1);
            }
        }

        job = new Job(this.conf.getConfigurationObject(), this.conf.getTriplesJobName() + " phase 2");

        job.setJarByClass(HDTBuilderDriver.class);

        FileInputFormat.addInputPath(job, this.conf.getTriplesSamplesPath());
        FileOutputFormat.setOutputPath(job, this.conf.getTriplesOutputPath());

        job.setInputFormatClass(SequenceFileInputFormat.class);
        LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

        job.setSortComparatorClass(TripleSPOComparator.class);
        job.setGroupingComparatorClass(TripleSPOComparator.class);

        job.setPartitionerClass(TotalOrderPartitioner.class);

        job.setOutputKeyClass(TripleSPOWritable.class);
        job.setOutputValueClass(NullWritable.class);

        job.setNumReduceTasks(this.conf.getTriplesReducers());

        // The sampler is typed with this job's real key/value classes
        // (it was previously declared as <Text, Text>, which does not match the data).
        System.out.println("Sampling started");
        InputSampler.writePartitionFile(job,
                new InputSampler.IntervalSampler<TripleSPOWritable, NullWritable>(this.conf.getSampleProbability()));
        String partitionFile = TotalOrderPartitioner.getPartitionFile(job.getConfiguration());
        URI partitionUri = new URI(partitionFile + "#" + TotalOrderPartitioner.DEFAULT_PATH);
        DistributedCache.addCacheFile(partitionUri, job.getConfiguration());
        DistributedCache.createSymlink(job.getConfiguration());
        System.out.println("Sampling finished");

        SequenceFileOutputFormat.setCompressOutput(job, true);
        SequenceFileOutputFormat.setOutputCompressorClass(job, com.hadoop.compression.lzo.LzoCodec.class);
        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

        jobOK = job.waitForCompletion(true);

        return jobOK;
    }

    /**
     * Builds the triples with a single MapReduce job (no sampling phase): reads the LZO text input,
     * maps it with {@link TriplesSPOMapper} (the dictionary file is shipped through the distributed
     * cache) and writes SequenceFiles keyed by {@link TripleSPOWritable} to the triples output path.
     * The {@code Triples} counter is then stored in {@code numTriples} and written to the triples
     * counters file.
     * <p>
     * Terminates the JVM with status -1 when the input path or the dictionary file is missing, or when
     * the output path already exists and deleting it was not requested.
     *
     * @return {@code true} if the Hadoop job completed successfully
     * @throws IOException            on file system or job submission errors
     * @throws ClassNotFoundException if a job class cannot be found
     * @throws InterruptedException   if waiting for the job is interrupted
     * @throws URISyntaxException     declared for symmetry with the two-phase variant
     */
    protected boolean runTriplesJobWithOneJob()
            throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
        Job job = null;
        boolean jobOK;

        // if input path does not exists, fail
        if (!this.inputFS.exists(this.conf.getInputPath())) {
            System.out.println("Dictionary input path does not exist: " + this.conf.getInputPath());
            System.exit(-1);
        }

        // if the dictionary file shipped to the tasks does not exist, fail.
        // (The previous check re-tested the input path, so it could never detect a missing dictionary.)
        if (!this.dictionaryFS.exists(this.conf.getDictionaryFile())) {
            System.out.println("Dictionary file does not exist: " + this.conf.getDictionaryFile());
            System.exit(-1);
        }

        // if triples output path exists...
        if (this.triplesFS.exists(this.conf.getTriplesOutputPath())) {
            if (this.conf.getDeleteTriplesOutputPath()) { // ... and option provided, delete recursively
                this.triplesFS.delete(this.conf.getTriplesOutputPath(), true);
            } else { // ... and option not provided, fail
                System.out.println("Triples output path does exist: " + this.conf.getTriplesOutputPath());
                System.out.println("Select other path or use option -dt to overwrite");
                System.exit(-1);
            }
        }

        // Launch job
        // NOTE(review): hard-coded child JVM options (heap size and an absolute error-file path)
        // override whatever the user configured; consider making them configurable.
        this.conf.setProperty("mapred.child.java.opts",
                "-XX:ErrorFile=/home/hadoop/tmp/hs_err_pid%p.log -Xmx2500m");

        // This is the triples job, so it must carry the triples job name
        // (it was previously labelled with the dictionary job name).
        job = new Job(this.conf.getConfigurationObject(), this.conf.getTriplesJobName());
        job.setJarByClass(HDTBuilderDriver.class);

        FileInputFormat.addInputPath(job, this.conf.getInputPath());
        FileOutputFormat.setOutputPath(job, this.conf.getTriplesOutputPath());

        job.setInputFormatClass(LzoTextInputFormat.class);
        LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

        job.setMapperClass(TriplesSPOMapper.class);
        job.setSortComparatorClass(TripleSPOComparator.class);
        job.setMapOutputKeyClass(TripleSPOWritable.class);
        job.setMapOutputValueClass(NullWritable.class);

        job.setNumReduceTasks(this.conf.getTriplesReducers());

        job.setOutputKeyClass(TripleSPOWritable.class);
        job.setOutputValueClass(NullWritable.class);

        DistributedCache.addCacheFile(this.conf.getDictionaryFile().toUri(), job.getConfiguration());

        jobOK = job.waitForCompletion(true);

        this.numTriples = job.getCounters().findCounter(Counters.Triples).getValue();
        // try-with-resources: the counters file is closed even when the write fails
        try (BufferedWriter bufferedWriter = new BufferedWriter(
                new OutputStreamWriter(this.triplesFS.create(this.conf.getTriplesCountersFile())))) {
            bufferedWriter.write(this.numTriples.toString() + "\n");
        }

        return jobOK;
    }

    /**
     * Assembles the final HDT file: loads the dictionary if it has not been built in this run, recovers
     * the section and triple counts from the counters files when needed, loads the triples from the
     * triples output path and saves dictionary plus triples to the configured HDT file.
     * <p>
     * The method name keeps its historical spelling because callers depend on it.
     *
     * @return always {@code true}; failures are reported through exceptions
     * @throws IOException if a counters file, the dictionary or the triples cannot be read, or the HDT
     *                     file cannot be written
     */
    protected boolean buidHDT() throws IOException {
        TransientHDT hdt = new TransientHDT(this.conf.getSpec());
        TransientBitMapTriples triples = new TransientBitMapTriples(this.conf.getSpec(), this.triplesFS,
                new Path("temp"));

        // if dictionary not built, load it
        if (this.dictionary == null) {
            System.out.println("Dictionary not built. Reading data from " + this.conf.getDictionaryFile());
            this.dictionary = this.loadDictionary(this.dictionaryFS, this.conf.getDictionaryFile());
        }

        // if maxvalues not loaded, read Counters
        if (!this.conf.runDictionary()) {

            System.out.println("Dictionary Samples job not ran. Reading data from file.");

            // try-with-resources: the counters file is closed even if a line fails to parse
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(this.dictionaryFS.open(this.conf.getDictionaryCountersFile())))) {
                String line = reader.readLine();
                while (line != null) {
                    String[] data = line.split("=");
                    switch (data[0]) {
                    case HDTBuilderConfiguration.SHARED:
                        this.numShared = Long.parseLong(data[1]);
                        break;
                    case HDTBuilderConfiguration.SUBJECTS:
                        this.numSubjects = Long.parseLong(data[1]);
                        break;
                    case HDTBuilderConfiguration.PREDICATES:
                        this.numPredicates = Long.parseLong(data[1]);
                        break;
                    case HDTBuilderConfiguration.OBJECTS:
                        this.numObjects = Long.parseLong(data[1]);
                        break;
                    default:
                        // unknown keys are ignored
                        break;
                    }
                    line = reader.readLine();
                }
            }
        }

        // if triples job not ran, read Counters. The count is also missing when only phase 2 of the
        // triples job ran in this execution (runTriplesJob does not set it), so fall back to the
        // file in that case too instead of failing on a null unboxing below.
        if (!this.conf.runTriples() || this.numTriples == null) {
            System.out.println("Triples job nor ran. Reading data from " + this.conf.getTriplesCountersFile());
            // The triples counters file is written through triplesFS, so it must be read through it as well
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(this.triplesFS.open(this.conf.getTriplesCountersFile())))) {
                this.numTriples = Long.parseLong(reader.readLine());
            }
        }

        this.loadFromDir(triples, this.numTriples, this.numPredicates, (this.numShared + this.numObjects),
                this.triplesFS, this.conf.getTriplesOutputPath());

        hdt.setDictionary(this.dictionary);
        hdt.setTriples(triples);
        hdt.populateHeaderStructure(this.conf.getBaseURI());

        // try-with-resources: the HDT output stream is closed even if saving fails midway
        try (BufferedOutputStream output = new BufferedOutputStream(
                this.triplesFS.create(this.conf.getHDTFile()))) {
            hdt.saveToHDT(output, this.listener);
        }

        return true;
    }

    /**
     * Loads a transient element from every SequenceFile found directly under {@code path}.
     * Entries whose name starts with {@code "_"} are skipped. When no file remains, the element is
     * initialized with size 0 and nothing is loaded; otherwise the files are sorted with
     * {@link FileStatusComparator}, the element is initialized with {@code numentries}, each file is
     * loaded in order and the element is closed.
     *
     * @param part       element to fill
     * @param numentries number of entries passed to {@code part.initialize}
     * @param fs         file system that holds {@code path}
     * @param path       directory containing the SequenceFiles
     * @throws IOException if listing the directory or reading a file fails
     */
    protected void loadFromDir(TransientElement part, long numentries, FileSystem fs, Path path)
            throws IOException {
        PathFilter filter = new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return !path.getName().startsWith("_");
            }
        };
        FileStatus[] status = fs.listStatus(path, filter);

        if (status.length == 0) {
            System.out.println("Path [" + path + "] has no files. Initializing section.");
            part.initialize(0);
        } else {
            Arrays.sort(status, new FileStatusComparator());

            System.out.println("Initializing section " + path);
            part.initialize(numentries);
            for (FileStatus file : status) {
                System.out.println("Reading file [" + file.getPath() + "]");
                // try-with-resources: the reader is closed even when loading the file fails
                try (SequenceFile.Reader reader = new SequenceFile.Reader(fs, file.getPath(),
                        this.conf.getConfigurationObject())) {
                    part.load(reader, this.listener);
                }
            }
            System.out.println("Closing section " + path);
            part.close();
        }
    }

    /**
     * Loads the bitmap triples from every SequenceFile found directly under {@code path}.
     * Entries whose name starts with {@code "_"} are skipped. When no file remains, the triples are
     * initialized with {@code initialize(0, 0)} and nothing is loaded; otherwise the files are sorted
     * with {@link FileStatusComparator}, the triples are initialized with the given sizes, each file is
     * loaded in order and the structure is closed.
     *
     * @param part         triples structure to fill
     * @param numentries   number of triples passed to {@code part.initialize}
     * @param maxpredicate predicate bound passed to {@code part.initialize}
     * @param maxobject    object bound passed to {@code part.initialize}
     * @param fs           file system that holds {@code path}
     * @param path         directory containing the SequenceFiles
     * @throws IOException if listing the directory or reading a file fails
     */
    protected void loadFromDir(TransientBitMapTriples part, long numentries, long maxpredicate, long maxobject,
            FileSystem fs, Path path) throws IOException {
        PathFilter filter = new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return !path.getName().startsWith("_");
            }
        };
        FileStatus[] status = fs.listStatus(path, filter);

        if (status.length == 0) {
            System.out.println("Path [" + path + "] has no files. Initializing section.");
            part.initialize(0, 0);
        } else {
            Arrays.sort(status, new FileStatusComparator());

            System.out.println("Initializing section " + path);
            part.initialize(numentries, maxpredicate, maxobject);
            for (FileStatus file : status) {
                System.out.println("Reading file [" + file.getPath() + "]");
                // try-with-resources: the reader is closed even when loading the file fails
                try (SequenceFile.Reader reader = new SequenceFile.Reader(fs, file.getPath(),
                        this.conf.getConfigurationObject())) {
                    part.load(reader, this.listener);
                }
            }
            System.out.println("Closing section " + path);
            part.close();
        }
    }

    /**
     * Reads a previously saved dictionary: first its control information, then the dictionary itself,
     * both from the same stream.
     *
     * @param fs             file system that holds the dictionary file
     * @param dictionaryPath location of the dictionary file
     * @return the loaded dictionary
     * @throws IOException if the file cannot be opened or read
     */
    protected FourSectionDictionary2 loadDictionary(FileSystem fs, Path dictionaryPath) throws IOException {
        FourSectionDictionary2 dictionary = new FourSectionDictionary2(this.conf.getSpec());
        ControlInformation ci = new ControlInformation();
        ci.clear();
        // try-with-resources: the stream used to be left open; it is now closed on success and on failure
        try (BufferedInputStream input = new BufferedInputStream(fs.open(dictionaryPath))) {
            ci.load(input);
            dictionary.load(input, ci, this.listener);
        }
        return dictionary;
    }

    /**
     * Writes the dictionary to {@code dictionaryPath}, using a fresh {@link ControlInformation}.
     *
     * @param dictionary     dictionary to save
     * @param fs             file system on which the file is created
     * @param dictionaryPath destination file
     * @throws IOException if the file cannot be created or written
     */
    protected void saveDictionary(FourSectionDictionary2 dictionary, FileSystem fs, Path dictionaryPath)
            throws IOException {
        // try-with-resources: the stream is flushed and closed even if saving fails midway
        try (BufferedOutputStream output = new BufferedOutputStream(fs.create(dictionaryPath))) {
            dictionary.save(output, new ControlInformation(), this.listener);
        }
    }

}