// Java tutorial
package fr.inra.maiage.bibliome.util.pubmed; import java.io.BufferedReader; import java.io.File; import java.io.FileFilter; import java.io.IOException; import java.io.InputStream; import java.net.MalformedURLException; import java.net.URISyntaxException; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.xml.sax.Attributes; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; import fr.inra.maiage.bibliome.util.Iterators; import fr.inra.maiage.bibliome.util.Strings; import fr.inra.maiage.bibliome.util.clio.CLIOException; import fr.inra.maiage.bibliome.util.clio.CLIOParser; import fr.inra.maiage.bibliome.util.clio.CLIOption; import fr.inra.maiage.bibliome.util.defaultmap.DefaultArrayListHashMap; import fr.inra.maiage.bibliome.util.defaultmap.DefaultMap; import fr.inra.maiage.bibliome.util.streams.CollectionSourceStream; import fr.inra.maiage.bibliome.util.streams.CompressionFilter; import fr.inra.maiage.bibliome.util.streams.SourceStream; import fr.inra.maiage.bibliome.util.streams.StreamFactory; import fr.inra.maiage.bibliome.util.xml.XMLUtils; public class PubMedIndexUpdater extends CLIOParser { public static final Pattern PUBMED_FILENAME_PATTERN = Pattern.compile("pubmed\\d+n\\d+\\.xml(?:\\.gz)?"); private static final String LOCATION_PUBMED_BASELINE = "ftp://ftp.ncbi.nlm.nih.gov/pubmed/baseline/"; private static final String LOCATION_PUBMED_UPDATEFILES = 
"ftp://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/"; private static final String LOCATION_PUBMED_OPEN = "ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_file_list.txt"; private static class PubMedFileFilter implements FileFilter { private PubmedIndexProperties properties = null; private boolean checkIndexedFile = true; @Override public boolean accept(File file) { String name = getFilename(file.getAbsolutePath()); Matcher m = PUBMED_FILENAME_PATTERN.matcher(name); if (!m.matches()) { return false; } if (checkIndexedFile && properties.isIndexedFile(name)) { PubMedIndexUtils.log("skipping: %s", file); return false; } return true; } } private final PubMedFileFilter fileFilter = new PubMedFileFilter(); private final StreamFactory streamFactory = new StreamFactory(); private File indexDir; private final Collection<SourceStream> sources = new ArrayList<SourceStream>(); private final DefaultMap<String, List<String>> meshPaths = new DefaultArrayListHashMap<String, String>(); private final Map<String, String> openLicenses = new HashMap<String, String>(); public PubMedIndexUpdater() { super(); streamFactory.setCompressionFilter(CompressionFilter.FILE_EXTENSION); streamFactory.setRecursive(true); streamFactory.setFilter(fileFilter); } @CLIOption(stop = true, value = "-help") public void help() { System.out.println(usage()); } @CLIOption("-force") public void force() { fileFilter.checkIndexedFile = false; } @CLIOption("-index") public void setIndexDir(File indexDir) { this.indexDir = indexDir; } @CLIOption("-mesh-tree") public void addMeSHRoots(String meshTreeLocation) throws IOException, URISyntaxException { StreamFactory streamFactory = new StreamFactory(); streamFactory.setCharset("UTF-16"); SourceStream source = streamFactory.getSourceStream(meshTreeLocation); PubMedIndexUtils.log("reading MeSH descriptor tree from %s", meshTreeLocation); boolean isHeaderLine = true; try (BufferedReader r = source.getBufferedReader()) { while (true) { String line = r.readLine(); if (line == null) { 
break; } if (isHeaderLine) { isHeaderLine = false; continue; } line = line.trim(); int tab = line.indexOf('\t'); String meshPath = line.substring(0, tab); String rest = line.substring(tab + 1); tab = rest.indexOf('\t'); String meshId = rest.substring(0, tab); meshPaths.safeGet(meshId).add(meshPath); } } } @CLIOption("-mesh-tree-xml") public void addMeSHRootsXML(String meshTreeLocation) throws SAXException, IOException, ParserConfigurationException { SAXParserFactory spf = SAXParserFactory.newInstance(); SAXParser parser = spf.newSAXParser(); PubMedIndexUtils.log("reading MeSH descriptors from %s", meshTreeLocation); parser.parse(meshTreeLocation, meshTreeHandler); } private final DefaultHandler meshTreeHandler = new DefaultHandler() { private String ui = null; private boolean inDescriptorUI = false; private boolean inTreeNumber = false; @Override public void endDocument() throws SAXException { super.endDocument(); ui = null; inDescriptorUI = false; inTreeNumber = false; } @Override public void startDocument() throws SAXException { super.startDocument(); ui = null; inDescriptorUI = false; inTreeNumber = false; } @Override public void characters(char[] ch, int start, int length) throws SAXException { super.characters(ch, start, length); if (inDescriptorUI) { ui = new String(ch, start, length); } if (inTreeNumber) { String path = new String(ch, start, length); meshPaths.safeGet(ui).add(path); } } @Override public void endElement(String uri, String localName, String qName) throws SAXException { super.endElement(uri, localName, qName); inDescriptorUI = false; inTreeNumber = false; } @Override public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException { super.startElement(uri, localName, qName, attributes); switch (qName) { case "DescriptorRecord": { ui = null; break; } case "DescriptorUI": { inDescriptorUI = (ui == null); break; } case "TreeNumber": { inTreeNumber = true; break; } } } }; @CLIOption("-baseline") public 
void downloadBaseline() throws MalformedURLException, IOException { PubMedIndexUtils.log("downloading baseline file list: %s", LOCATION_PUBMED_BASELINE); SourceStream source = new PubMedListingSourceStream(LOCATION_PUBMED_BASELINE, fileFilter); sources.add(source); } @CLIOption("-update-files") public void downloadUpdateFiles() throws MalformedURLException, IOException { PubMedIndexUtils.log("downloading update file list: %s", LOCATION_PUBMED_UPDATEFILES); SourceStream source = new PubMedListingSourceStream(LOCATION_PUBMED_UPDATEFILES, fileFilter); sources.add(source); } @CLIOption("-open-access") public void indexOpenAccessStatus() throws IOException, URISyntaxException { PubMedIndexUtils.log("downloading open access list: %s", LOCATION_PUBMED_OPEN); SourceStream source = streamFactory.getSourceStream(LOCATION_PUBMED_OPEN); boolean firstLine = true; try (BufferedReader r = source.getBufferedReader()) { while (true) { String line = r.readLine(); if (line == null) { break; } if (firstLine) { firstLine = false; // skip first line that contains a date continue; } List<String> cols = Strings.split(line, '\t', -1); String pmid = cols.get(3); if (pmid.isEmpty()) { continue; } if (!pmid.startsWith("PMID:")) { continue; } pmid = pmid.substring(5); String license = cols.get(4); openLicenses.put(pmid, license); } } } @Override protected boolean processArgument(String arg) throws CLIOException { try { SourceStream stream = streamFactory.getSourceStream(arg); sources.add(stream); return false; } catch (IOException | URISyntaxException e) { throw new CLIOException(e); } } @Override public String getResourceBundleName() { return PubMedIndexUpdater.class.getCanonicalName() + "Help"; } public void update() throws CorruptIndexException, IOException, ParserConfigurationException, SAXException { try (IndexWriter indexWriter = openIndexWriter(indexDir)) { SAXParser parser = createParser(); PubMedIndexDOMBuilderHandler handler = new PubMedIndexDOMBuilderHandler(XMLUtils.docBuilder, 
indexWriter, meshPaths, openLicenses); PubmedIndexProperties properties = new PubmedIndexProperties(indexWriter); fileFilter.properties = properties; SourceStream source = new CollectionSourceStream("UTF-8", sources); for (InputStream is : Iterators.loop(source.getInputStreams())) { String streamName = source.getStreamName(is); String filename = getFilename(streamName); PubMedIndexUtils.log("parsing and indexing: %s", filename); handler.resetCounts(); handler.setSource(filename); parser.parse(is, handler); properties.addIndexedFile(filename); properties.update(indexWriter); indexWriter.commit(); PubMedIndexUtils.log("citations updated: %d", handler.getUpdatedCitationsCount()); PubMedIndexUtils.log("citations deleted: %d", handler.getDeletedCitationsCount()); } } } private static String getFilename(String streamName) { int slash = streamName.lastIndexOf(File.separatorChar); String filename = streamName.substring(slash + 1); return filename.replace(".gz", ""); } private static SAXParser createParser() throws ParserConfigurationException, SAXException { SAXParserFactory pf = SAXParserFactory.newInstance(); return pf.newSAXParser(); } private static IndexWriterConfig getIndexWriterConfig() { Analyzer analyzer = PubMedIndexUtils.getGlobalAnalyzer(); IndexWriterConfig result = new IndexWriterConfig(PubMedIndexUtils.LUCENE_VERSION, analyzer); result.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND); return result; } private static IndexWriter openIndexWriter(File indexPath) throws IOException { Directory dir = FSDirectory.open(indexPath); IndexWriterConfig config = getIndexWriterConfig(); return new IndexWriter(dir, config); } public static void main(String[] args) throws CLIOException, CorruptIndexException, IOException, ParserConfigurationException, SAXException { PubMedIndexUpdater inst = new PubMedIndexUpdater(); if (inst.parse(args)) { return; } if (inst.indexDir == null) { throw new CLIOException("missing index location"); } if (inst.sources.isEmpty()) { 
throw new CLIOException("missing source files location"); } inst.update(); } }