com.tekstosense.segmenter.Main.java Source code

Java tutorial

Introduction

Here is the source code for com.tekstosense.segmenter.Main.java

Source

/*******************************************************************************
 * Copyright (c) 2016, TekstoSense and/or its affiliates. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/
package com.tekstosense.segmenter;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;

import com.tekstosense.segmenter.model.DoublyLinkedList;
import com.tekstosense.segmenter.model.LineNode;

import org.kohsuke.args4j.Argument;
import org.kohsuke.args4j.CmdLineException;
import org.kohsuke.args4j.CmdLineParser;
import org.kohsuke.args4j.Option;

import javax.imageio.ImageIO;

import java.awt.image.BufferedImage;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.List;

public class Main extends PDFTextStripper {

    @Option(name = "-p", usage = "Specify an optional decryption password.", required = false, multiValued = false, metaVar = "PASSWD")
    private String password = "";

    @Option(name = "-m", usage = "Also create a text location mask image.", required = false, multiValued = false, metaVar = "PAGENO")
    private int maskImagePage = 0;

    @Argument
    private List<String> filenames = new ArrayList<String>();

    /**
     * Instantiate a new PDFTextStripper object.
     *
     * @throws IOException If there is an error loading the properties.
     */
    public Main() throws IOException {
    }

    private TextExtractor parsePdf(File f) throws IOException {
        PDDocument doc = PDDocument.load(f);

        if (doc.isEncrypted()) {
            // Some documents are encrypted with the empty password. Try
            // to decrypt with this password, or the one passed in on the
            // command line (if any), and fail if we can't.
            try {
                doc.setAllSecurityToBeRemoved(false);
                //doc.decrypt(password); // Defaults to the empty string.
            } catch (Exception e) {
                throw new IOException("Can't decrypt document: ", e);
            }
        }
        TextExtractor te = new TextExtractor();
        te.writeText(doc, new OutputStreamWriter(new ByteArrayOutputStream()));

        return te;
    }

    private void doMain() {
        for (String filename : filenames) {
            File inputFile = new File(filename);
            TextExtractor te = null;

            try {
                te = parsePdf(inputFile);
                DoublyLinkedList<LineNode> sentenceList = te.toLinkedList();
                sentenceList.displayList();
            } catch (IOException e) {
                System.err.println("Couldn't read file '" + inputFile + "'.");
                System.exit(1);
            }

            try {
                if (maskImagePage > 0) {
                    BufferedImage maskImage = te.toMaskImage(maskImagePage);
                    ImageIO.write(maskImage, "png", new File("mask.png"));
                }
            } catch (IOException e) {
                System.err.println("Couldn't write mask image 'mask.png'.");
                System.exit(1);
            }

        }
    }

    public static void main(String[] args) throws IOException {
        Main m = new Main();
        CmdLineParser parser = new CmdLineParser(m);

        try {
            parser.parseArgument(args);

            if (m.filenames.size() == 0) {
                System.err.println("Usage: pdf2xml [options] <FILEs>");
                parser.printUsage(System.err);
            } else {
                m.doMain();
            }
        } catch (CmdLineException e) {
            parser.printUsage(System.err);
        }
    }
}