001// License: GPL. For details, see LICENSE file.
002package org.openstreetmap.josm.data.validation.tests;
003
004import static java.util.regex.Pattern.CASE_INSENSITIVE;
005import static java.util.regex.Pattern.UNICODE_CASE;
006import static org.openstreetmap.josm.tools.I18n.tr;
007
008import java.awt.geom.Point2D;
009import java.util.ArrayList;
010import java.util.Arrays;
011import java.util.HashMap;
012import java.util.List;
013import java.util.Locale;
014import java.util.Map;
015import java.util.regex.Matcher;
016import java.util.regex.Pattern;
017
018import org.openstreetmap.josm.data.osm.OsmPrimitive;
019import org.openstreetmap.josm.data.osm.Way;
020import org.openstreetmap.josm.data.validation.Severity;
021import org.openstreetmap.josm.data.validation.Test;
022import org.openstreetmap.josm.data.validation.TestError;
023import org.openstreetmap.josm.data.validation.util.ValUtil;
024import org.openstreetmap.josm.gui.progress.ProgressMonitor;
025import org.openstreetmap.josm.tools.MultiMap;
026import org.openstreetmap.josm.tools.Utils;
027
028/**
029 * Checks for similar named ways, symptom of a possible typo. It uses the
030 * Levenshtein distance to check for similarity
031 *
032 * @author frsantos
033 */
034public class SimilarNamedWays extends Test {
035
036    protected static final int SIMILAR_NAMED = 701;
037
038    /** All ways, grouped by cells */
039    private Map<Point2D, List<Way>> cellWays;
040    /** The already detected errors */
041    private MultiMap<Way, Way> errorWays;
042
043    private final List<NormalizeRule> rules = new ArrayList<>();
044
045    /**
046     * Constructor
047     */
048    public SimilarNamedWays() {
049        super(tr("Similarly named ways"),
050                tr("This test checks for ways with similar names that may have been misspelled."));
051
052        // FIXME: hardcode these rules for now. Replace them with preferences later
053        // See https://josm.openstreetmap.de/ticket/3733#comment:19
054        addRegExprRule("\\pN+", "0"); // Unicode numbers: matches "Highway 66" but also persian numbers
055        addRegExprRule("M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$", "0"); // Roman numbers: matches "Building II"
056        addRegExprRule("\\d+(st|nd|rd|th)", "0st"); // 3rd Ave
057        addRegExprRule("^[A-Z] ", "X"); // E Street
058        addSynonyms("east", "west", "north", "south");
059        addSynonyms("first", "second", "third");
060    }
061
062    @Override
063    public void startTest(ProgressMonitor monitor) {
064        super.startTest(monitor);
065        cellWays = new HashMap<>(1000);
066        errorWays = new MultiMap<>();
067    }
068
069    @Override
070    public void endTest() {
071        cellWays = null;
072        errorWays = null;
073        super.endTest();
074    }
075
076    @Override
077    public void visit(Way w) {
078        if (!w.isUsable())
079            return;
080
081        String name = w.get("name");
082        if (name == null || name.length() < 6)
083            return;
084
085        List<List<Way>> theCellWays = ValUtil.getWaysInCell(w, cellWays);
086        for (List<Way> ways : theCellWays) {
087            for (Way w2 : ways) {
088                if (errorWays.contains(w, w2) || errorWays.contains(w2, w)) {
089                    continue;
090                }
091
092                String name2 = w2.get("name");
093                if (name2 == null || name2.length() < 6) {
094                    continue;
095                }
096
097                if (similaryName(name, name2)) {
098                    List<OsmPrimitive> primitives = new ArrayList<>(2);
099                    primitives.add(w);
100                    primitives.add(w2);
101                    errors.add(TestError.builder(this, Severity.WARNING, SIMILAR_NAMED)
102                            .message(tr("Similarly named ways"))
103                            .primitives(primitives)
104                            .build());
105                    errorWays.put(w, w2);
106                }
107            }
108            ways.add(w);
109        }
110    }
111
112    /**
113     * Compute Levenshtein distance
114     *
115     * @param s First word
116     * @param t Second word
117     * @return The distance between words
118     * @deprecated Use {@link Utils#getLevenshteinDistance} instead
119     */
120    @Deprecated
121    public static int getLevenshteinDistance(String s, String t) {
122        return Utils.getLevenshteinDistance(s, t);
123    }
124
125    /**
126     * Add a regular expression rule.
127     * @param regExpr the regular expression to search for
128     * @param replacement a string to replace with, which should match the expression.
129     */
130    public void addRegExprRule(String regExpr, String replacement) {
131        rules.add(new RegExprRule(regExpr, replacement));
132    }
133
134    /**
135     * Add a rule with synonym words.
136     * @param words words which are synonyms
137     */
138    public void addSynonyms(String... words) {
139        for (String word : words) {
140            rules.add(new SynonymRule(word, words));
141        }
142    }
143
144    /**
145     * Check if two names are similar, but not identical. First both names will be "normalized".
146     * Afterwards the Levenshtein distance will be calculated.<br>
147     * Examples for normalization rules:<br>
148     * <code>replaceAll("\\d+", "0")</code><br>
149     * would cause similaryName("track 1", "track 2") = false, but similaryName("Track 1", "track 2") = true
150     * @param name first name to compare
151     * @param name2 second name to compare
152     * @return true if the normalized names are different but only a "little bit"
153     */
154    public boolean similaryName(String name, String name2) {
155        boolean similar = Utils.isSimilar(name, name2);
156
157        // try all rules
158        for (NormalizeRule rule : rules) {
159            int levenshteinDistance = Utils.getLevenshteinDistance(rule.normalize(name), rule.normalize(name2));
160            if (levenshteinDistance == 0)
161                // one rule results in identical names: identical
162                return false;
163            else if (levenshteinDistance <= 2) {
164                // 0 < distance <= 2
165                similar = true;
166            }
167        }
168        return similar;
169    }
170
171    /**
172     * A normalization that is applied to names before testing them
173     */
174    @FunctionalInterface
175    public interface NormalizeRule {
176
177        /**
178         * Normalize the string by replacing parts.
179         * @param name name to normalize
180         * @return normalized string
181         */
182        String normalize(String name);
183    }
184
185    /**
186     * A rule to replace by regular expression,
187     * so that all strings matching the regular expression are handled as if they were {@link RegExprRule#replacement}
188     */
189    public static class RegExprRule implements NormalizeRule {
190        private final Pattern regExpr;
191        private final String replacement;
192
193        /**
194         * Create a new rule to replace by regular expression
195         * @param expression The regular expression
196         * @param replacement The replacement
197         */
198        public RegExprRule(String expression, String replacement) {
199            this.regExpr = Pattern.compile(expression);
200            this.replacement = replacement;
201        }
202
203        @Override
204        public String normalize(String name) {
205            return regExpr.matcher(name).replaceAll(replacement);
206        }
207
208        @Override
209        public String toString() {
210            return "replaceAll(" + regExpr + ", " + replacement + ')';
211        }
212    }
213
214    /**
215     * A rule that registers synonyms to a given word
216     */
217    public static class SynonymRule implements NormalizeRule {
218
219        private final String[] words;
220        private final Pattern regExpr;
221        private final String replacement;
222
223        /**
224         * Create a new {@link SynonymRule}
225         * @param replacement The word to use instead
226         * @param words The synonyms for that word
227         */
228        public SynonymRule(String replacement, String... words) {
229            this.replacement = replacement.toLowerCase(Locale.ENGLISH);
230            this.words = words;
231
232            // build regular expression for other words (for fast match)
233            StringBuilder expression = new StringBuilder();
234            int maxLength = 0;
235            for (int i = 0; i < words.length; i++) {
236                if (words[i].length() > maxLength) {
237                    maxLength = words[i].length();
238                }
239                if (expression.length() > 0) {
240                    expression.append('|');
241                }
242                expression.append(Pattern.quote(words[i]));
243            }
244            this.regExpr = Pattern.compile(expression.toString(), CASE_INSENSITIVE + UNICODE_CASE);
245        }
246
247        @Override
248        public String normalize(String name) {
249            // find first match
250            Matcher matcher = regExpr.matcher(name);
251            if (!matcher.find())
252                return name;
253
254            int start = matcher.start();
255
256            // which word matches?
257            String part = "";
258            for (int i = 0; i < words.length; i++) {
259                String word = words[i];
260                part = name.substring(start, start + word.length());
261                if (word.equalsIgnoreCase(part)) {
262                    break;
263                }
264            }
265
266            // replace the word
267            char[] newName = matcher.replaceFirst(replacement).toCharArray();
268
269            // adjust case (replacement is not shorter than matching word!)
270            int minLength = Math.min(replacement.length(), part.length());
271            for (int i = 0; i < minLength; i++) {
272                if (Character.isUpperCase(part.charAt(i))) {
273                    newName[start + i] = Character.toUpperCase(newName[start + i]);
274                }
275            }
276
277            return new String(newName);
278        }
279
280        @Override
281        public String toString() {
282            return "synonyms(" + replacement + ", " + Arrays.toString(words) + ')';
283        }
284    }
285}