import java.util.*;
import java.io.*;
import java.text.*;

/**
 * Copyright (c) 2003, Regents of the University of California
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the
 * distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **/

/**
 * The ExtractAbbrev class implements a simple algorithm for
 * extraction of abbreviations and their definitions from biomedical text.
 * Abbreviations (short forms) are extracted from the input file, and those abbreviations
 * for which a definition (long form) is found are printed out, along with that definition,
 * one per line.
 *
 * A file consisting of short-form/long-form pairs (tab separated) can be specified
 * in tandem with the -testlist option for the purposes of evaluating the algorithm.
 *
 * @see A Simple Algorithm for Identifying Abbreviation Definitions in Biomedical Text
 * A.S. Schwartz, M.A. Hearst; Pacific Symposium on Biocomputing 8:451-462(2003)
 * for a detailed description of the algorithm.
 *
 * @author Ariel Schwartz
 * @version 03/12/03
 *
 * @updated 07/20/16 by Marti Hearst to include BSD License.
 */
public class ExtractAbbrev {

    /** Known-good definitions for test mode: short form (String) -> Vector of long forms (String). */
    HashMap mTestDefinitions = new HashMap();

    /** Not read or written anywhere in this class. */
    HashMap mStats = new HashMap();

    /**
     * Test-mode counters. Only truePositives and falsePositives are ever
     * incremented by this class; the other two are printed but stay at 0.
     */
    int truePositives = 0, falsePositives = 0, falseNegatives = 0, trueNegatives = 0;

    /** Field separator used both when reading the test list and when printing results. */
    char delimiter = '\t';

    /** True when a -testlist file was supplied; switches output to TP/FP scoring. */
    boolean testMode = false;

    /**
     * A short form is acceptable when it contains at least one letter and
     * starts with a letter, a digit or an opening parenthesis.
     *
     * @param str candidate short form
     * @return true if the candidate passes both checks
     */
    private boolean isValidShortForm(String str) {
        // hasLetter() is false for "", so charAt(0) is never reached on an empty string.
        return (hasLetter(str) &&
                (Character.isLetterOrDigit(str.charAt(0)) || (str.charAt(0) == '(')));
    }

    /**
     * @param str string to inspect
     * @return true if any character of str is a letter
     */
    private boolean hasLetter(String str) {
        for (int i = 0; i < str.length(); i++)
            if (Character.isLetter(str.charAt(i)))
                return true;
        return false;
    }

    /**
     * @param str string to inspect
     * @return true if any character of str is upper case
     */
    private boolean hasCapital(String str) {
        for (int i = 0; i < str.length(); i++)
            if (Character.isUpperCase(str.charAt(i)))
                return true;
        return false;
    }

    /**
     * Loads short-form/long-form pairs from a delimiter-separated file into
     * mTestDefinitions. Several long forms may be registered for one short form.
     *
     * Lines that do not contain the delimiter are skipped (non-blank ones are
     * reported on stderr) so that one malformed line does not discard the rest
     * of the list. I/O failures are reported on stderr and loading stops; the
     * pairs read so far are kept.
     *
     * @param inFile path of the test-list file
     */
    private void loadTrueDefinitions(String inFile) {
        String abbrString, defnString, str = "";
        Vector entry;
        HashMap definitions = mTestDefinitions;
        BufferedReader fin = null;

        // try/finally (not try-with-resources) keeps the file compilable at the
        // language level the rest of this source is written in.
        try {
            fin = new BufferedReader(new FileReader(inFile));
            while ((str = fin.readLine()) != null) {
                int j = str.indexOf(delimiter);
                if (j < 0) {
                    // No delimiter: substring(0, j) would throw and abort the whole load.
                    if (str.trim().length() > 0)
                        System.err.println("Skipping test-list line without delimiter: " + str);
                    continue;
                }
                abbrString = str.substring(0, j).trim();
                defnString = str.substring(j + 1).trim();
                entry = (Vector) definitions.get(abbrString);
                if (entry == null)
                    entry = new Vector();
                entry.add(defnString);
                definitions.put(abbrString, entry);
            }
        } catch (IOException e) {
            // Diagnostics go to stderr; stdout carries the tab-separated results.
            e.printStackTrace();
            System.err.println(str);
        } finally {
            if (fin != null) {
                try {
                    fin.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing a read-only stream fails.
                }
            }
        }
    }

    /**
     * Checks a proposed pair against the loaded test list.
     *
     * @param shortForm abbreviation, matched exactly (case-sensitive map key)
     * @param longForm  definition, matched ignoring case
     * @return true if the test list contains this long form for this short form
     */
    private boolean isTrueDefinition(String shortForm, String longForm) {
        Vector entry;
        Iterator itr;

        entry = (Vector) mTestDefinitions.get(shortForm);
        if (entry == null)
            return false;
        itr = entry.iterator();
        while (itr.hasNext()) {
            if (itr.next().toString().equalsIgnoreCase(longForm))
                return true;
        }
        return false;
    }

    /**
     * Reads the input file line by line, accumulates text into a running
     * sentence buffer, and for every " (...)" group hands a candidate
     * short-form/long-form pair to extractAbbrPair, which prints accepted pairs.
     *
     * An empty line, or a line that follows a paragraph break and does not
     * start with an upper-case character, resets the buffer.
     *
     * Any exception (I/O or index arithmetic) is reported on stderr together
     * with the current buffer and tmpIndex, and processing stops; the reader is
     * closed in every case.
     *
     * @param inFile path of the text file to scan
     * @return a Vector that this method never adds to (always empty)
     */
    private Vector extractAbbrPairs(String inFile) {
        String str, tmpStr, longForm = "", shortForm = "";
        String currSentence = "";
        int openParenIndex, closeParenIndex = -1, sentenceEnd, newCloseParenIndex, tmpIndex = -1;
        boolean newParagraph = true;
        StringTokenizer shortTokenizer;
        Vector candidates = new Vector();
        BufferedReader fin = null;

        try {
            fin = new BufferedReader(new FileReader(inFile));
            while ((str = fin.readLine()) != null) {
                if (str.length() == 0 || newParagraph && !Character.isUpperCase(str.charAt(0))) {
                    currSentence = "";
                    newParagraph = true;
                    continue;
                }
                newParagraph = false;
                str += " ";
                currSentence += str;
                openParenIndex = currSentence.indexOf(" (");
                do {
                    // indexOf(" (") points at the space; step onto the '(' itself.
                    if (openParenIndex > -1)
                        openParenIndex++;
                    sentenceEnd = Math.max(currSentence.lastIndexOf(". "), currSentence.lastIndexOf(", "));
                    if ((openParenIndex == -1) && (sentenceEnd == -1)) {
                        //Do nothing
                    } else if (openParenIndex == -1) {
                        // No parenthesis pending: drop everything up to the last ". " or ", ".
                        currSentence = currSentence.substring(sentenceEnd + 2);
                    } else if ((closeParenIndex = currSentence.indexOf(')', openParenIndex)) > -1) {
                        sentenceEnd = Math.max(currSentence.lastIndexOf(". ", openParenIndex),
                                               currSentence.lastIndexOf(", ", openParenIndex));
                        // -2 so that sentenceEnd + 2 == 0, i.e. the long form starts at the buffer start.
                        if (sentenceEnd == -1)
                            sentenceEnd = -2;
                        longForm = currSentence.substring(sentenceEnd + 2, openParenIndex);
                        shortForm = currSentence.substring(openParenIndex + 1, closeParenIndex);
                    }
                    if (shortForm.length() > 0 || longForm.length() > 0) {
                        if (shortForm.length() > 1 && longForm.length() > 1) {
                            // Nested "(": extend the short form to the next ')'.
                            if ((shortForm.indexOf('(') > -1) &&
                                ((newCloseParenIndex = currSentence.indexOf(')', closeParenIndex + 1)) > -1)) {
                                shortForm = currSentence.substring(openParenIndex + 1, newCloseParenIndex);
                                closeParenIndex = newCloseParenIndex;
                            }
                            // Keep only the part before the first ", " and then before the first "; ".
                            if ((tmpIndex = shortForm.indexOf(", ")) > -1)
                                shortForm = shortForm.substring(0, tmpIndex);
                            if ((tmpIndex = shortForm.indexOf("; ")) > -1)
                                shortForm = shortForm.substring(0, tmpIndex);
                            shortTokenizer = new StringTokenizer(shortForm);
                            if (shortTokenizer.countTokens() > 2 || shortForm.length() > longForm.length()) {
                                // Long form in ( )
                                // The word immediately before the parenthesis becomes the short form.
                                tmpIndex = currSentence.lastIndexOf(" ", openParenIndex - 2);
                                tmpStr = currSentence.substring(tmpIndex + 1, openParenIndex - 1);
                                longForm = shortForm;
                                shortForm = tmpStr;
                                if (!hasCapital(shortForm))
                                    shortForm = "";
                            }
                            if (isValidShortForm(shortForm)) {
                                extractAbbrPair(shortForm.trim(), longForm.trim());
                            }
                        }
                        // Continue scanning after the closing parenthesis.
                        currSentence = currSentence.substring(closeParenIndex + 1);
                    } else if (openParenIndex > -1) {
                        if ((currSentence.length() - openParenIndex) > 200)
                            // Matching close paren was not found
                            currSentence = currSentence.substring(openParenIndex + 1);
                        break; // Read next line
                    }
                    shortForm = "";
                    longForm = "";
                } while ((openParenIndex = currSentence.indexOf(" (")) > -1);
            }
        } catch (Exception ioe) {
            // Deliberately broad: the index arithmetic above can throw at run time, and
            // the original best-effort behaviour is to report and stop, not to crash.
            // Diagnostics go to stderr; stdout carries the tab-separated results.
            ioe.printStackTrace();
            System.err.println(currSentence);
            System.err.println(tmpIndex);
        } finally {
            if (fin != null) {
                try {
                    fin.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing a read-only stream fails.
                }
            }
        }
        return candidates;
    }

    /**
     * Finds the shortest tail of longForm that accounts for every letter or
     * digit of shortForm. Both strings are scanned right to left and compared
     * case-insensitively; other characters in shortForm are ignored. The first
     * character of shortForm must match a long-form character that is at
     * position 0 or is not preceded by a letter or digit.
     *
     * @param shortForm abbreviation candidate
     * @param longForm  text preceding the abbreviation
     * @return the part of longForm starting after the last space at or before
     *         the position preceding the final match, or null if some
     *         character of shortForm cannot be matched
     */
    private String findBestLongForm(String shortForm, String longForm) {
        int sIndex;
        int lIndex;
        char currChar;

        sIndex = shortForm.length() - 1;
        lIndex = longForm.length() - 1;
        for ( ; sIndex >= 0; sIndex--) {
            currChar = Character.toLowerCase(shortForm.charAt(sIndex));
            if (!Character.isLetterOrDigit(currChar))
                continue;
            // Move left until the character matches; for the first short-form
            // character also keep moving while the match sits inside a word.
            while (((lIndex >= 0) && (Character.toLowerCase(longForm.charAt(lIndex)) != currChar)) ||
                   ((sIndex == 0) && (lIndex > 0) && (Character.isLetterOrDigit(longForm.charAt(lIndex - 1)))))
                lIndex--;
            if (lIndex < 0)
                return null;
            lIndex--;
        }
        lIndex = longForm.lastIndexOf(" ", lIndex) + 1;
        return longForm.substring(lIndex);
    }

    /**
     * Validates one candidate pair and prints it to stdout if accepted.
     * Output is "short&lt;delimiter&gt;long", with an extra "&lt;delimiter&gt;TP" or
     * "&lt;delimiter&gt;FP" column in test mode, where the matching counter is
     * also incremented.
     *
     * A pair is rejected when the short form is a single character, no long
     * form can be aligned, the long form is shorter than the short form,
     * the long form contains the short form followed by a space or ends with
     * it, the long form has more than min(2 * n, n + 5) tokens (n = number of
     * letters/digits in the short form), or n exceeds 10.
     *
     * @param shortForm abbreviation candidate
     * @param longForm  text in which to look for the definition
     */
    private void extractAbbrPair(String shortForm, String longForm) {
        String bestLongForm;
        StringTokenizer tokenizer;
        int longFormSize, shortFormSize;

        if (shortForm.length() == 1)
            return;
        bestLongForm = findBestLongForm(shortForm, longForm);
        if (bestLongForm == null)
            return;
        // Hyphens split tokens too, so "beta-blocker" counts as two words.
        tokenizer = new StringTokenizer(bestLongForm, " \t\n\r\f-");
        longFormSize = tokenizer.countTokens();
        shortFormSize = shortForm.length();
        // Count only letters and digits of the short form.
        for (int i = shortFormSize - 1; i >= 0; i--)
            if (!Character.isLetterOrDigit(shortForm.charAt(i)))
                shortFormSize--;
        if (bestLongForm.length() < shortForm.length() ||
            bestLongForm.indexOf(shortForm + " ") > -1 ||
            bestLongForm.endsWith(shortForm) ||
            longFormSize > shortFormSize * 2 ||
            longFormSize > shortFormSize + 5 ||
            shortFormSize > 10)
            return;

        if (testMode) {
            if (isTrueDefinition(shortForm, bestLongForm)) {
                System.out.println(shortForm + delimiter + bestLongForm + delimiter + "TP");
                truePositives++;
            } else {
                falsePositives++;
                System.out.println(shortForm + delimiter + bestLongForm + delimiter + "FP");
            }
        } else {
            System.out.println(shortForm + delimiter + bestLongForm);
        }
    }

    /** Prints the command-line help to stderr and terminates the JVM with exit status 1. */
    private static void usage() {
        System.err.println("Usage: ExtractAbbrev [-options] <filename>");
        System.err.println("       <filename> contains text from which abbreviations are extracted");
        System.err.println("       -testlist <file> = list of true abbreviation definition pairs");
        System.err.println("       -usage or -help = this message");
        System.exit(1);
    }

    /**
     * Command-line entry point. Accepts an optional "-testlist file" pair
     * followed by the input file name, which must be the last argument.
     * In test mode the TP/FP/FN/TN counters are printed after extraction.
     *
     * @param args command-line arguments
     */
    public static void main(String[] args) {
        ExtractAbbrev extractAbbrev = new ExtractAbbrev();
        String filename = null;
        String testList = null;

        //parse arguments
        for (int i = 0; i < args.length; i++) {
            if (args[i].equals("-testlist")) {
                if (i == args.length - 1) {
                    usage();
                }
                testList = args[++i];
                extractAbbrev.testMode = true;
            } else if (args[i].equals("-usage")) {
                usage();
            } else if (args[i].equals("-help")) {
                usage();
            } else {
                filename = args[i];
                // Must be last arg
                if (i != args.length - 1) {
                    usage();
                }
            }
        }
        if (filename == null) {
            usage();
        }

        if (extractAbbrev.testMode)
            extractAbbrev.loadTrueDefinitions(testList);
        extractAbbrev.extractAbbrPairs(filename);
        if (extractAbbrev.testMode)
            System.out.println("TP: " + extractAbbrev.truePositives + " FP: " + extractAbbrev.falsePositives +
                               " FN: " + extractAbbrev.falseNegatives + " TN: " + extractAbbrev.trueNegatives);
    }
}