pl.edu.icm.cermine.bx.DocumentBibZonesCorrector.java Source code

Java tutorial

Introduction

Here is the source code for pl.edu.icm.cermine.bx.DocumentBibZonesCorrector.java

Source

/**
 * This file is part of CERMINE project.
 * Copyright (c) 2011-2013 ICM-UW
 *
 * CERMINE is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * CERMINE is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with CERMINE. If not, see <http://www.gnu.org/licenses/>.
 */

package pl.edu.icm.cermine.bx;

import com.google.common.collect.Lists;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.List;
import org.apache.commons.cli.*;
import org.apache.commons.io.FileUtils;
import pl.edu.icm.cermine.exception.AnalysisException;
import pl.edu.icm.cermine.exception.TransformationException;
import pl.edu.icm.cermine.structure.model.BxDocument;
import pl.edu.icm.cermine.structure.model.BxPage;
import pl.edu.icm.cermine.structure.model.BxZone;
import pl.edu.icm.cermine.structure.model.BxZoneLabel;
import pl.edu.icm.cermine.structure.transformers.BxDocumentToTrueVizWriter;
import pl.edu.icm.cermine.structure.transformers.TrueVizToBxDocumentReader;

public class DocumentBibZonesCorrector {

    public static void main(String[] args) throws TransformationException, IOException, AnalysisException,
            ParseException, CloneNotSupportedException {
        Options options = new Options();
        options.addOption("input", true, "input path");
        options.addOption("output", true, "output path");
        CommandLineParser parser = new GnuParser();
        CommandLine line = parser.parse(options, args);
        String inDir = line.getOptionValue("input");
        String outDir = line.getOptionValue("output");

        File dir = new File(inDir);

        for (File f : FileUtils.listFiles(dir, new String[] { "xml" }, true)) {
            TrueVizToBxDocumentReader tvReader = new TrueVizToBxDocumentReader();
            List<BxPage> pages = tvReader.read(new FileReader(f));
            BxDocument doc = new BxDocument().setPages(pages);
            doc.setFilename(f.getName());

            for (BxZone z : doc.asZones()) {
                if (!BxZoneLabel.MET_BIB_INFO.equals(z.getLabel()) && !BxZoneLabel.REFERENCES.equals(z.getLabel())
                        && z.childrenCount() <= 2
                        && (z.toText().toLowerCase().contains("journal ")
                                || z.toText().toLowerCase().contains("vol.")
                                || z.toText().toLowerCase().contains("vol ")
                                || z.toText().toLowerCase().contains("pp.")
                                || z.toText().toLowerCase().contains("volume ")
                                || z.toText().toLowerCase().contains("pp ")
                                || z.toText().toLowerCase().contains("issn")
                                || z.toText().toLowerCase().contains("doi:")
                                || z.toText().toLowerCase().contains("doi ")
                                || z.toText().toLowerCase().contains("citation:"))) {
                    System.out.println("DETECTED BIBINFO: ");
                    System.out.println(z.getLabel() + " " + z.toText());
                    System.out.println("");
                    z.setLabel(BxZoneLabel.MET_BIB_INFO);
                } else if (!BxZoneLabel.OTH_UNKNOWN.equals(z.getLabel())
                        && !BxZoneLabel.MET_BIB_INFO.equals(z.getLabel()) && z.childrenCount() <= 2
                        && (z.toText().toLowerCase().contains("page "))) {
                    System.out.println("DETECTED PAGE: ");
                    System.out.println(z.getLabel() + " " + z.toText());
                    System.out.println("");
                    z.setLabel(BxZoneLabel.OTH_UNKNOWN);
                }
            }

            File f2 = new File(outDir + doc.getFilename());
            BxDocumentToTrueVizWriter wrt = new BxDocumentToTrueVizWriter();
            boolean created = f2.createNewFile();
            if (!created) {
                throw new IOException("Cannot create file: ");
            }
            FileWriter fw = new FileWriter(f2);
            wrt.write(fw, Lists.newArrayList(doc));
            fw.flush();
            fw.close();
        }
    }
}