com.xx.platform.util.tools.ms.MSExtractor.java Source code

Java tutorial

Introduction

Here is the source code for com.xx.platform.util.tools.ms.MSExtractor.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.xx.platform.util.tools.ms;

// JDK imports
import java.io.InputStream;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;

/**
 * Defines a Microsoft document content extractor.
 *
 * <p>Concrete extractors implement {@link #extractText(POIFSFileSystem)}.
 * The {@link #extractText(InputStream)} overload is a convenience entry
 * point that wraps a raw stream in a {@link POIFSFileSystem} and delegates
 * to that implementation.
 *
 * @author Jérôme Charron
 */
public abstract class MSExtractor {

    /** Logger registered under {@code MSExtractor}; visible to subclasses. */
    protected static final Log LOG = LogFactory.getLog(MSExtractor.class);

    /** Constructs a new Microsoft document extractor. */
    protected MSExtractor() {
    }

    /**
     * Extracts the text content from a Microsoft document input stream.
     *
     * <p>The stream is wrapped in a new {@link POIFSFileSystem}, which is
     * then handed to {@link #extractText(POIFSFileSystem)}. This method does
     * not itself close {@code input}.
     *
     * @param input the stream holding the document to read
     * @return whatever {@link #extractText(POIFSFileSystem)} returns for the
     *         file system built from {@code input}
     * @throws Exception if building the file system from the stream fails,
     *         or if the delegated extraction throws
     */
    public String extractText(InputStream input) throws Exception {
        return extractText(new POIFSFileSystem(input));
    }

    /**
     * Extracts the text content from a Microsoft document that has already
     * been opened as a POIFS file system.
     *
     * @param poifs the file system representing the document
     * @return the extracted text, as defined by the implementing subclass
     * @throws Exception if the subclass fails to extract the text
     */
    public abstract String extractText(POIFSFileSystem poifs) throws Exception;

}