001// License: GPL. For details, see LICENSE file. 002package org.openstreetmap.josm.data.validation.tests; 003 004import static java.util.regex.Pattern.CASE_INSENSITIVE; 005import static java.util.regex.Pattern.UNICODE_CASE; 006import static org.openstreetmap.josm.tools.I18n.tr; 007 008import java.awt.geom.Point2D; 009import java.util.ArrayList; 010import java.util.Arrays; 011import java.util.HashMap; 012import java.util.List; 013import java.util.Locale; 014import java.util.Map; 015import java.util.regex.Matcher; 016import java.util.regex.Pattern; 017 018import org.openstreetmap.josm.data.osm.OsmPrimitive; 019import org.openstreetmap.josm.data.osm.Way; 020import org.openstreetmap.josm.data.validation.Severity; 021import org.openstreetmap.josm.data.validation.Test; 022import org.openstreetmap.josm.data.validation.TestError; 023import org.openstreetmap.josm.data.validation.util.ValUtil; 024import org.openstreetmap.josm.gui.progress.ProgressMonitor; 025import org.openstreetmap.josm.tools.MultiMap; 026import org.openstreetmap.josm.tools.Utils; 027 028/** 029 * Checks for similar named ways, symptom of a possible typo. It uses the 030 * Levenshtein distance to check for similarity 031 * 032 * @author frsantos 033 */ 034public class SimilarNamedWays extends Test { 035 036 protected static final int SIMILAR_NAMED = 701; 037 038 /** All ways, grouped by cells */ 039 private Map<Point2D, List<Way>> cellWays; 040 /** The already detected errors */ 041 private MultiMap<Way, Way> errorWays; 042 043 private final List<NormalizeRule> rules = new ArrayList<>(); 044 045 /** 046 * Constructor 047 */ 048 public SimilarNamedWays() { 049 super(tr("Similarly named ways"), 050 tr("This test checks for ways with similar names that may have been misspelled.")); 051 052 // FIXME: hardcode these rules for now. Replace them with preferences later 053 // See https://josm.openstreetmap.de/ticket/3733#comment:19 054 addRegExprRule("\\pN+", "0"); // Unicode numbers: matches "Highway 66" but also persian numbers 055 addRegExprRule("M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$", "0"); // Roman numbers: matches "Building II" 056 addRegExprRule("\\d+(st|nd|rd|th)", "0st"); // 3rd Ave 057 addRegExprRule("^[A-Z] ", "X"); // E Street 058 addSynonyms("east", "west", "north", "south"); 059 addSynonyms("first", "second", "third"); 060 } 061 062 @Override 063 public void startTest(ProgressMonitor monitor) { 064 super.startTest(monitor); 065 cellWays = new HashMap<>(1000); 066 errorWays = new MultiMap<>(); 067 } 068 069 @Override 070 public void endTest() { 071 cellWays = null; 072 errorWays = null; 073 super.endTest(); 074 } 075 076 @Override 077 public void visit(Way w) { 078 if (!w.isUsable()) 079 return; 080 081 String name = w.get("name"); 082 if (name == null || name.length() < 6) 083 return; 084 085 List<List<Way>> theCellWays = ValUtil.getWaysInCell(w, cellWays); 086 for (List<Way> ways : theCellWays) { 087 for (Way w2 : ways) { 088 if (errorWays.contains(w, w2) || errorWays.contains(w2, w)) { 089 continue; 090 } 091 092 String name2 = w2.get("name"); 093 if (name2 == null || name2.length() < 6) { 094 continue; 095 } 096 097 if (similaryName(name, name2)) { 098 List<OsmPrimitive> primitives = new ArrayList<>(2); 099 primitives.add(w); 100 primitives.add(w2); 101 errors.add(TestError.builder(this, Severity.WARNING, SIMILAR_NAMED) 102 .message(tr("Similarly named ways")) 103 .primitives(primitives) 104 .build()); 105 errorWays.put(w, w2); 106 } 107 } 108 ways.add(w); 109 } 110 } 111 112 /** 113 * Compute Levenshtein distance 114 * 115 * @param s First word 116 * @param t Second word 117 * @return The distance between words 118 * @deprecated Use {@link Utils#getLevenshteinDistance} instead 119 */ 120 @Deprecated 121 public static int getLevenshteinDistance(String s, String t) { 122 return Utils.getLevenshteinDistance(s, t); 123 } 124 125 /** 126 * Add a regular expression rule. 127 * @param regExpr the regular expression to search for 128 * @param replacement a string to replace with, which should match the expression. 129 */ 130 public void addRegExprRule(String regExpr, String replacement) { 131 rules.add(new RegExprRule(regExpr, replacement)); 132 } 133 134 /** 135 * Add a rule with synonym words. 136 * @param words words which are synonyms 137 */ 138 public void addSynonyms(String... words) { 139 for (String word : words) { 140 rules.add(new SynonymRule(word, words)); 141 } 142 } 143 144 /** 145 * Check if two names are similar, but not identical. First both names will be "normalized". 146 * Afterwards the Levenshtein distance will be calculated.<br> 147 * Examples for normalization rules:<br> 148 * <code>replaceAll("\\d+", "0")</code><br> 149 * would cause similaryName("track 1", "track 2") = false, but similaryName("Track 1", "track 2") = true 150 * @param name first name to compare 151 * @param name2 second name to compare 152 * @return true if the normalized names are different but only a "little bit" 153 */ 154 public boolean similaryName(String name, String name2) { 155 boolean similar = Utils.isSimilar(name, name2); 156 157 // try all rules 158 for (NormalizeRule rule : rules) { 159 int levenshteinDistance = Utils.getLevenshteinDistance(rule.normalize(name), rule.normalize(name2)); 160 if (levenshteinDistance == 0) 161 // one rule results in identical names: identical 162 return false; 163 else if (levenshteinDistance <= 2) { 164 // 0 < distance <= 2 165 similar = true; 166 } 167 } 168 return similar; 169 } 170 171 /** 172 * A normalization that is applied to names before testing them 173 */ 174 @FunctionalInterface 175 public interface NormalizeRule { 176 177 /** 178 * Normalize the string by replacing parts. 179 * @param name name to normalize 180 * @return normalized string 181 */ 182 String normalize(String name); 183 } 184 185 /** 186 * A rule to replace by regular expression, 187 * so that all strings matching the regular expression are handled as if they were {@link RegExprRule#replacement} 188 */ 189 public static class RegExprRule implements NormalizeRule { 190 private final Pattern regExpr; 191 private final String replacement; 192 193 /** 194 * Create a new rule to replace by regular expression 195 * @param expression The regular expression 196 * @param replacement The replacement 197 */ 198 public RegExprRule(String expression, String replacement) { 199 this.regExpr = Pattern.compile(expression); 200 this.replacement = replacement; 201 } 202 203 @Override 204 public String normalize(String name) { 205 return regExpr.matcher(name).replaceAll(replacement); 206 } 207 208 @Override 209 public String toString() { 210 return "replaceAll(" + regExpr + ", " + replacement + ')'; 211 } 212 } 213 214 /** 215 * A rule that registers synonyms to a given word 216 */ 217 public static class SynonymRule implements NormalizeRule { 218 219 private final String[] words; 220 private final Pattern regExpr; 221 private final String replacement; 222 223 /** 224 * Create a new {@link SynonymRule} 225 * @param replacement The word to use instead 226 * @param words The synonyms for that word 227 */ 228 public SynonymRule(String replacement, String... words) { 229 this.replacement = replacement.toLowerCase(Locale.ENGLISH); 230 this.words = words; 231 232 // build regular expression for other words (for fast match) 233 StringBuilder expression = new StringBuilder(); 234 int maxLength = 0; 235 for (int i = 0; i < words.length; i++) { 236 if (words[i].length() > maxLength) { 237 maxLength = words[i].length(); 238 } 239 if (expression.length() > 0) { 240 expression.append('|'); 241 } 242 expression.append(Pattern.quote(words[i])); 243 } 244 this.regExpr = Pattern.compile(expression.toString(), CASE_INSENSITIVE + UNICODE_CASE); 245 } 246 247 @Override 248 public String normalize(String name) { 249 // find first match 250 Matcher matcher = regExpr.matcher(name); 251 if (!matcher.find()) 252 return name; 253 254 int start = matcher.start(); 255 256 // which word matches? 257 String part = ""; 258 for (int i = 0; i < words.length; i++) { 259 String word = words[i]; 260 part = name.substring(start, start + word.length()); 261 if (word.equalsIgnoreCase(part)) { 262 break; 263 } 264 } 265 266 // replace the word 267 char[] newName = matcher.replaceFirst(replacement).toCharArray(); 268 269 // adjust case (replacement is not shorter than matching word!) 270 int minLength = Math.min(replacement.length(), part.length()); 271 for (int i = 0; i < minLength; i++) { 272 if (Character.isUpperCase(part.charAt(i))) { 273 newName[start + i] = Character.toUpperCase(newName[start + i]); 274 } 275 } 276 277 return new String(newName); 278 } 279 280 @Override 281 public String toString() { 282 return "synonyms(" + replacement + ", " + Arrays.toString(words) + ')'; 283 } 284 } 285}