Java tutorial
/* * Copyright (C) 2013 argonet.co.kr <ddoleye@gmail.com> * * This library is free software: you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This library is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * See the GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ /* * ? ? ?(.hwp) . * * ? ?? . * https://github.com/cogniti/ruby-hwp/ * https://github.com/cogniti/libghwp/ */ package com.argo.hwp.v5; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.io.Writer; import java.util.Arrays; import java.util.Iterator; import java.util.zip.Inflater; import java.util.zip.InflaterInputStream; import org.apache.poi.poifs.filesystem.DirectoryEntry; import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.DocumentEntry; import org.apache.poi.poifs.filesystem.DocumentInputStream; import org.apache.poi.poifs.filesystem.Entry; import org.apache.poi.poifs.filesystem.NDocumentInputStream; import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; import org.apache.poi.util.LittleEndian; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.argo.hwp.utils.HwpStreamReader; public abstract class HwpTextExtractorV5 { protected static Logger log = LoggerFactory.getLogger(HwpTextExtractorV5.class); private static final byte[] HWP_V5_SIGNATURE = "HWP Document File".getBytes(); private static final int[] HWP_CONTROL_CHARS = new int[] { 0, 10, 13, 24, 25, 26, 27, 28, 29, 30, 31 }; private static final int[] HWP_INLINE_CHARS = new int[] { 4, 5, 6, 7, 8, 9, 19, 20 }; private static final int[] HWP_EXTENDED_CHARS = new int[] { 1, 2, 3, 11, 12, 14, 15, 16, 17, 18, 21, 22, 23 }; private static final int HWPTAG_BEGIN = 0x010; /** * HWP ?? ? * * @param source * @param writer * @return * @throws FileNotFoundException * @throws IOException */ public static boolean extractText(File source, Writer writer) throws FileNotFoundException, IOException { if (source == null) throw new IllegalArgumentException(); if (!source.exists()) throw new FileNotFoundException(); NPOIFSFileSystem fs = null; try { FileHeader header; // HWP Document ? try { // ? Compound File fs = new NPOIFSFileSystem(source); header = getHeader(fs); } catch (IOException e) { log.warn("? ? . HWP ?? ", e); return false; } if (header == null) return false; // TODO ? .. BodyText ViewText? Section ? // ? ? ? ? ? // https://groups.google.com/forum/#!topic/libhwp/raZpuBS2BX4 if (header.viewtext) { log.warn( "? ? . https://groups.google.com/forum/#!topic/libhwp/raZpuBS2BX4"); // return or throw exception return false; } // HWP // ?? IOException ? HWP ? ?. extractText(header, fs, writer); return true; } finally { if (fs != null) { try { fs.close(); } catch (IOException e) { log.warn("Exception", e); } } } } /** * HWP? FileHeader * * @param fs * @return * @throws IOException */ private static FileHeader getHeader(NPOIFSFileSystem fs) throws IOException { DirectoryNode root = fs.getRoot(); // ??? p.18 // FileHeader Entry headerEntry = root.getEntry("FileHeader"); if (!headerEntry.isDocumentEntry()) return null; // ? byte[] header = new byte[256]; // FileHeader ? 256 DocumentInputStream headerStream = new DocumentInputStream((DocumentEntry) headerEntry); try { int read = headerStream.read(header); if (read != 256 || !Arrays.equals(HWP_V5_SIGNATURE, Arrays.copyOfRange(header, 0, HWP_V5_SIGNATURE.length))) return null; } finally { headerStream.close(); } FileHeader fileHeader = new FileHeader(); // . debug fileHeader.version = HwpVersion.parseVersion(LittleEndian.getUInt(header, 32)); long flags = LittleEndian.getUInt(header, 36); log.debug("Flags={}", Long.toBinaryString(flags).replace(' ', '0')); fileHeader.compressed = (flags & 0x01) == 0x01; fileHeader.encrypted = (flags & 0x02) == 0x02; fileHeader.viewtext = (flags & 0x04) == 0x04; return fileHeader; } /** * ? * * @param writer * @param source * * @return * @throws IOException */ private static void extractText(FileHeader header, NPOIFSFileSystem fs, Writer writer) throws IOException { DirectoryNode root = fs.getRoot(); // BodyText ? Entry bodyText = root.getEntry("BodyText"); if (bodyText == null || !bodyText.isDirectoryEntry()) throw new IOException("Invalid BodyText"); Iterator<Entry> iterator = ((DirectoryEntry) bodyText).getEntries(); while (iterator.hasNext()) { Entry entry = iterator.next(); if (entry.getName().startsWith("Section") && entry instanceof DocumentEntry) { log.debug("extract {}", entry.getName()); InputStream input = new NDocumentInputStream((DocumentEntry) entry); if (header.compressed) input = new InflaterInputStream(input, new Inflater(true)); HwpStreamReader sectionStream = new HwpStreamReader(input); try { extractText(sectionStream, writer); } finally { // ? ? ? try { input.close(); } catch (IOException e) { log.error("? ??", e); } } } else { log.warn(" Entry '{}'({})", entry.getName(), entry); } } } /** * Section ? ? * * @param sectionStream * @param writer * @throws IOException */ private static void extractText(HwpStreamReader sectionStream, Writer writer) throws IOException { StringBuffer buf = new StringBuffer(1024); TagInfo tag = new TagInfo(); while (true) { if (!readTag(sectionStream, tag)) break; buf.setLength(0); if (HWPTAG_BEGIN + 50 == tag.id) { writeParaHeader(sectionStream, tag.length, buf); } else if (HWPTAG_BEGIN + 51 == tag.id) { if (tag.length % 2 != 0) throw new IOException("Invalid block size"); writeParaText(sectionStream, tag.length, buf); if (buf.length() > 0) // ? writer.append(buf.toString()).append('\n'); } else { sectionStream.ensureSkip(tag.length); } if (buf.length() > 0) { log.debug("TAG[{}]({}):{} [{}]", new Object[] { tag.id, tag.level, tag.length, buf }); } } } private static void writeParaHeader(HwpStreamReader sectionStream, long length, StringBuffer buf) throws IOException { // log.debug("text={}", sectionStream.uint32()); // log.debug("control mask={}", sectionStream.uint32()); // log.debug("?={}", sectionStream.uint16()); // log.debug("??={}", sectionStream.uint8()); // log.debug("={}", sectionStream.uint8()); // log.debug("?={}", sectionStream.uint16()); // log.debug("range tag={}", sectionStream.uint16()); // log.debug("?? align={}", sectionStream.uint16()); // log.debug(" Instance ID={}", sectionStream.uint32()); // sectionStream.ensureSkip(2); sectionStream.ensureSkip(length); } /** * HWPTAG_PARA_TEXT ? ?? ? * * @param sectionStream * @param datasize * @param buf * @throws IOException */ private static void writeParaText(HwpStreamReader sectionStream, long datasize, StringBuffer buf) throws IOException { int[] chars = sectionStream.uint16((int) (datasize / 2)); for (int index = 0; index < chars.length; index++) { int ch = chars[index]; if (Arrays.binarySearch(HWP_INLINE_CHARS, ch) >= 0) { if (ch == 9) { buf.append('\t'); } index += 7; } else if (Arrays.binarySearch(HWP_EXTENDED_CHARS, ch) >= 0) { index += 7; } else if (Arrays.binarySearch(HWP_CONTROL_CHARS, ch) >= 0) { buf.append(' '); } else { buf.append((char) ch); } } } private static boolean readTag(HwpStreamReader sectionStream, TagInfo tag) throws IOException { // p.24 long recordHeader = sectionStream.uint32(); if (recordHeader == -1) return false; // log.debug("Record Header={} [{}]", recordHeader, // Long.toHexString(recordHeader)); tag.id = recordHeader & 0x3FF; tag.level = (recordHeader >> 10) & 0x3FF; tag.length = (recordHeader >> 20) & 0xFFF; // ?? p.24 if (tag.length == 0xFFF) tag.length = sectionStream.uint32(); return true; } static class FileHeader { HwpVersion version; boolean compressed; // bit 0 boolean encrypted; // bit 1 boolean viewtext; // bit 2 } static class TagInfo { long id; long level; long length; } static class HwpVersion { int m; int n; int p; int r; public String toString() { return String.format("%d.%d.%d.%d", m, n, p, r); } public static HwpVersion parseVersion(long longVersion) { HwpVersion version = new HwpVersion(); version.m = (int) ((longVersion & 0xFF000000L) >> 24); version.n = (int) ((longVersion & 0x00FF0000L) >> 16); version.p = (int) ((longVersion & 0x0000FF00L) >> 8); version.r = (int) ((longVersion & 0x000000FFL)); return version; } } }