org.apache.jackrabbit.oak.plugins.blob.datastore.DataStoreTextWriter.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.jackrabbit.oak.plugins.blob.datastore.DataStoreTextWriter.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.jackrabbit.oak.plugins.blob.datastore;

import java.io.BufferedWriter;
import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.lang.ref.SoftReference;
import java.util.Set;
import java.util.concurrent.Callable;

import javax.annotation.Nonnull;

import com.google.common.base.Charsets;
import com.google.common.collect.Sets;
import com.google.common.io.Files;
import org.apache.commons.io.FileUtils;
import org.apache.jackrabbit.oak.api.Blob;
import org.apache.jackrabbit.oak.plugins.index.fulltext.ExtractedText;
import org.apache.jackrabbit.oak.plugins.index.fulltext.ExtractedText.ExtractionResult;
import org.apache.jackrabbit.oak.plugins.index.fulltext.PreExtractedTextProvider;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.base.Preconditions.checkState;

/**
 * TextWriter implementation which just stores the extracted text
 * as files using the same layout as used by FileDataStore
 */
/**
 * TextWriter implementation which stores the extracted text as plain files,
 * using the same three-level directory layout as FileDataStore (two hex chars
 * per level, derived from the blobId). Blobs with empty or failed extraction
 * are tracked in two marker files ({@code blobs_empty.txt},
 * {@code blobs_error.txt}) which are loaded on construction and persisted on
 * {@link #close()}.
 *
 * <p>In read-only mode the class acts purely as a {@link PreExtractedTextProvider}:
 * the marker sets are held via {@link SoftReference} (reloaded on demand) and
 * all mutating operations throw {@link IllegalStateException}.
 */
public class DataStoreTextWriter implements TextWriter, Closeable, PreExtractedTextProvider {
    private static final String ERROR_BLOB_FILE = "blobs_error.txt";
    private static final String EMPTY_BLOB_FILE = "blobs_empty.txt";

    private static final Logger log = LoggerFactory.getLogger(DataStoreTextWriter.class);

    /** Root directory for the extracted-text files and the two marker files. */
    private final File directory;

    /** Ids of blobs whose extraction produced no text. */
    private final SetHolder emptyBlobsHolder;
    /** Ids of blobs whose extraction failed. */
    private final SetHolder errorBlobsHolder;
    private boolean closed;
    /**
     * Flag indicating that blobId passed is one from DataStoreBlobStore.
     * As those blobId's have the length encoded which would need to be
     * stripped off before being used as a file name.
     */
    private boolean dataStoreBlobId = true;

    private final boolean readOnlyMode;

    /**
     * Creates a writer rooted at the given directory, creating it if needed.
     *
     * @param directory root directory for extracted text and marker files
     * @param readOnlyMode if {@code true}, disallow writes and hold the marker
     *        sets softly so memory can be reclaimed after reindexing
     * @throws IOException declared for API compatibility; directory creation
     *         failures surface as {@link IllegalArgumentException}
     */
    public DataStoreTextWriter(File directory, boolean readOnlyMode) throws IOException {
        if (!directory.exists()) {
            checkArgument(directory.mkdirs(), "Cannot create directory %s", directory.getAbsolutePath());
        }
        this.directory = directory;
        this.readOnlyMode = readOnlyMode;
        this.emptyBlobsHolder = new SetHolder(createLoader(EMPTY_BLOB_FILE), readOnlyMode);
        this.errorBlobsHolder = new SetHolder(createLoader(ERROR_BLOB_FILE), readOnlyMode);

        if (!readOnlyMode) {
            log.info("Using {} to store the extracted text content. Empty count {}, Error count {}",
                    directory.getAbsolutePath(), getEmptyBlobs().size(), getErrorBlobs().size());
        } else {
            log.info("Using extracted store from {}", directory.getAbsolutePath());
        }
    }

    /**
     * Looks up previously extracted text for the given blob.
     *
     * @return {@link ExtractedText#EMPTY} or {@link ExtractedText#ERROR} when
     *         the blob is marked as such, the stored text on a hit, or
     *         {@code null} when the blob has no content identity or no stored
     *         text exists
     */
    @Override
    public ExtractedText getText(String propertyPath, Blob blob) throws IOException {
        String blobId = blob.getContentIdentity();
        if (blobId == null) {
            log.debug("No id found for blob at path {}", propertyPath);
            return null;
        }

        blobId = stripLength(blobId);
        ExtractedText result = null;
        if (getEmptyBlobs().contains(blobId)) {
            result = ExtractedText.EMPTY;
        } else if (getErrorBlobs().contains(blobId)) {
            result = ExtractedText.ERROR;
        } else {
            File textFile = getFile(blobId);
            if (textFile.exists()) {
                String text = Files.toString(textFile, Charsets.UTF_8);
                result = new ExtractedText(ExtractionResult.SUCCESS, text);
            }
        }

        if (log.isDebugEnabled()) {
            String extractionResult = result != null ? result.getExtractionResult().toString() : null;
            log.debug("Extraction result for [{}] at path [{}] is [{}]", blobId, propertyPath, extractionResult);
        }
        return result;
    }

    /**
     * Stores the extracted text for the given blob id as a UTF-8 file.
     *
     * @throws IllegalStateException if read-only mode is enabled
     */
    @Override
    public void write(@Nonnull String blobId, @Nonnull String text) throws IOException {
        checkIfReadOnlyModeEnabled();
        checkNotNull(blobId, "BlobId cannot be null");
        checkNotNull(text, "Text passed for [%s] was null", blobId);

        File textFile = getFile(stripLength(blobId));
        ensureParentExists(textFile);
        //TODO should we compress
        Files.write(text, textFile, Charsets.UTF_8);
    }

    /** Marks the blob as having produced no text; persisted on {@link #close()}. */
    @Override
    public synchronized void markEmpty(String blobId) {
        checkIfReadOnlyModeEnabled();
        getEmptyBlobs().add(stripLength(blobId));
    }

    /** Marks the blob as having failed extraction; persisted on {@link #close()}. */
    @Override
    public synchronized void markError(String blobId) {
        checkIfReadOnlyModeEnabled();
        getErrorBlobs().add(stripLength(blobId));
    }

    /**
     * Returns {@code true} if the blob was already handled: marked empty,
     * marked as error, or has a stored text file.
     */
    @Override
    public synchronized boolean isProcessed(String blobId) {
        blobId = stripLength(blobId);
        if (getEmptyBlobs().contains(blobId) || getErrorBlobs().contains(blobId)) {
            return true;
        }
        File textFile = getFile(blobId);
        return textFile.exists();
    }

    /**
     * Persists the empty/error marker sets. A no-op when already closed or in
     * read-only mode. Note: if persisting fails, {@code closed} stays false so
     * a retry is possible.
     */
    @Override
    public synchronized void close() throws IOException {
        if (closed || readOnlyMode) {
            return;
        }
        writeToFile(EMPTY_BLOB_FILE, getEmptyBlobs());
        writeToFile(ERROR_BLOB_FILE, getErrorBlobs());
        closed = true;
    }

    @Override
    public String toString() {
        return "FileDataStore based text provider";
    }

    // Package-private accessors used by tests to inspect load behavior.
    SetHolder getEmptyBlobsHolder() {
        return emptyBlobsHolder;
    }

    SetHolder getErrorBlobsHolder() {
        return errorBlobsHolder;
    }

    /**
     * Returns the identified file. This method implements the pattern
     * used to avoid problems with too many files in a single directory.
     * <p/>
     * No sanity checks are performed on the given identifier; ids shorter
     * than 6 characters would throw {@link StringIndexOutOfBoundsException}.
     *
     * @param identifier file name
     * @return identified file
     */
    private File getFile(String identifier) {
        File file = directory;
        file = new File(file, identifier.substring(0, 2));
        file = new File(file, identifier.substring(2, 4));
        file = new File(file, identifier.substring(4, 6));
        return new File(file, identifier);
    }

    /**
     * Strips the DataStoreBlobStore length suffix from the id so it matches
     * the on-disk FileDataStore naming. Pass-through when the flag is off.
     */
    private String stripLength(String blobId) {
        if (dataStoreBlobId) {
            return DataStoreBlobStore.BlobId.of(blobId).blobId;
        }
        return blobId;
    }

    private Set<String> getEmptyBlobs() {
        return emptyBlobsHolder.get();
    }

    private Set<String> getErrorBlobs() {
        return errorBlobsHolder.get();
    }

    private void checkIfReadOnlyModeEnabled() {
        checkState(!readOnlyMode, "Read only mode enabled");
    }

    /**
     * Creates a lazy loader for the given marker file; used by
     * {@link SetHolder} to (re)load state on demand in read-only mode.
     */
    private Callable<Set<String>> createLoader(final String fileName) {
        final File file = new File(directory, fileName);
        return new Callable<Set<String>>() {
            @Override
            public Set<String> call() throws Exception {
                return loadFromFile(file);
            }

            @Override
            public String toString() {
                return "Loading state from " + file.getAbsolutePath();
            }
        };
    }

    /** Loads one blob id per line; returns an empty set for a missing file. */
    private Set<String> loadFromFile(File file) throws IOException {
        Set<String> result = Sets.newHashSet();
        if (file.exists()) {
            result.addAll(Files.readLines(file, Charsets.UTF_8));
        }
        return result;
    }

    /**
     * Writes one blob id per line to the given marker file. Skips writing
     * entirely when the set is empty (an existing file is left untouched).
     * The writer is closed in a finally block so it is not leaked when a
     * write fails midway.
     */
    private void writeToFile(String fileName, Set<String> blobIds) throws IOException {
        if (blobIds.isEmpty()) {
            return;
        }
        File file = new File(directory, fileName);
        BufferedWriter bw = Files.newWriter(file, Charsets.UTF_8);
        try {
            for (String id : blobIds) {
                bw.write(id);
                bw.newLine();
            }
        } finally {
            bw.close();
        }
    }

    /** Creates the parent directory chain for the text file if missing. */
    private static void ensureParentExists(File file) throws IOException {
        if (!file.exists()) {
            File parent = file.getParentFile();
            FileUtils.forceMkdir(parent);
        }
    }

    /**
     * While running in read only mode the PreExtractedTextProvider
     * would only be used while reindexing. So as to avoid holding memory
     * a SoftReference is used: the set is reloaded via the loader whenever
     * the GC has cleared it. In read-write mode the set is held strongly.
     * Not internally synchronized; callers (the enclosing class) serialize
     * mutating access via their own {@code synchronized} methods.
     */
    static class SetHolder {
        /** Strong reference; non-null only in read-write mode. */
        private final Set<String> state;
        /** Soft reference used in read-only mode; may be cleared by GC. */
        private SoftReference<Set<String>> stateRef;
        private final Callable<Set<String>> loader;
        /** Number of times the loader ran; exposed for tests. */
        private int loadCount;

        public SetHolder(Callable<Set<String>> loader, boolean softRef) {
            this.loader = loader;
            if (softRef) {
                this.state = null;
            } else {
                this.state = load();
            }
        }

        /** Returns the current set, reloading it if the soft ref was cleared. */
        public Set<String> get() {
            Set<String> result = state;
            if (result != null) {
                return result;
            }

            if (stateRef != null) {
                result = stateRef.get();
            }

            if (result == null) {
                result = load();
                stateRef = new SoftReference<Set<String>>(result);
            }

            return result;
        }

        public int getLoadCount() {
            return loadCount;
        }

        /** Runs the loader; on failure logs and falls back to an empty set. */
        private Set<String> load() {
            try {
                loadCount++;
                return loader.call();
            } catch (Exception e) {
                log.warn("Error occurred while loading the state via {}", loader, e);
                return Sets.newHashSet();
            }
        }
    }
}