001//////////////////////////////////////////////////////////////////////////////// 002// checkstyle: Checks Java source code for adherence to a set of rules. 003// Copyright (C) 2001-2015 the original author or authors. 004// 005// This library is free software; you can redistribute it and/or 006// modify it under the terms of the GNU Lesser General Public 007// License as published by the Free Software Foundation; either 008// version 2.1 of the License, or (at your option) any later version. 009// 010// This library is distributed in the hope that it will be useful, 011// but WITHOUT ANY WARRANTY; without even the implied warranty of 012// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 013// Lesser General Public License for more details. 014// 015// You should have received a copy of the GNU Lesser General Public 016// License along with this library; if not, write to the Free Software 017// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 018//////////////////////////////////////////////////////////////////////////////// 019 020package com.puppycrawl.tools.checkstyle.checks; 021 022import java.util.regex.Matcher; 023import java.util.regex.Pattern; 024 025import com.puppycrawl.tools.checkstyle.api.Check; 026import com.puppycrawl.tools.checkstyle.api.DetailAST; 027import com.puppycrawl.tools.checkstyle.api.TokenTypes; 028 029/** 030 * <p> 031 * Restrict using <a href = 032 * "http://docs.oracle.com/javase/specs/jls/se8/html/jls-3.html#jls-3.3"> 033 * Unicode escapes</a> (e.g. \u221e). 034 * It is possible to allow using escapes for 035 * <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters"> 036 * non-printable(control) characters</a>. 037 * Also, this check can be configured to allow using escapes 038 * if trail comment is present. By the option it is possible to 039 * allow using escapes if literal contains only them. By the option it 040 * is possible to allow using escapes for space literals. 
 * </p>
 * <p>
 * Examples of using Unicode:</p>
 * <pre>
 * String unitAbbrev = "μs"; //Best: perfectly clear even without a comment.
 * String unitAbbrev = "\u03bcs"; //Poor: the reader has no idea what this is.
 * </pre>
 * <p>
 * An example of how to configure the check is:
 * </p>
 * <pre>
 * &lt;module name="AvoidEscapedUnicodeCharacters"/&gt;
 * </pre>
 * <p>
 * An example of non-printable (control) characters:
 * </p>
 * <pre>
 * return '\ufeff' + content; // byte order mark
 * </pre>
 * <p>
 * An example of how to configure the check to allow using escapes
 * for non-printable (control) characters:
 * </p>
 * <pre>
 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
 *     &lt;property name="allowEscapesForControlCharacters" value="true"/&gt;
 * &lt;/module&gt;
 * </pre>
 * <p>
 * Example of using escapes with a trailing comment:
 * </p>
 * <pre>
 * String unitAbbrev = "\u03bcs"; // Greek letter mu, "s"
 * </pre>
 * <p>An example of how to configure the check to allow using escapes
 * if a trailing comment is present:
 * </p>
 * <pre>
 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
 *     &lt;property name="allowByTailComment" value="true"/&gt;
 * &lt;/module&gt;
 * </pre>
 * <p>Example of using escapes if the literal contains only them:
 * </p>
 * <pre>
 * String unitAbbrev = "\u03bc\u03bc\u03bc";
 * </pre>
 * <p>An example of how to configure the check to allow escapes
 * if the literal contains only them:
 * </p>
 * <pre>
 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
 *     &lt;property name="allowIfAllCharactersEscaped" value="true"/&gt;
 * &lt;/module&gt;
 * </pre>
 * <p>An example of how to configure the check to allow non-printable escapes:
 * </p>
 * <pre>
 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
 *     &lt;property name="allowNonPrintableEscapes" value="true"/&gt;
 * &lt;/module&gt;
 * </pre>
 *
 * @author maxvetrenko
 *
 */
107public class AvoidEscapedUnicodeCharactersCheck 108 extends Check { 109 /** Regular expression for Unicode chars. */ 110 private static final Pattern UNICODE_REGEXP = Pattern.compile("\\\\u[a-fA-F0-9]{4}"); 111 112 /** Regular expression Unicode control characters. */ 113 private static final Pattern UNICODE_CONTROL = Pattern.compile("\\\\(u|U)" 114 + "(00[0-1][0-1A-Fa-f]|00[8-9][0-9A-Fa-f]|034(f|F)|070(f|F)" 115 + "|180(e|E)|200[b-fB-F]|202[b-eB-E]|206[0-4a-fA-F]" 116 + "|[fF]{3}[9a-bA-B]|[fF][eE][fF]{2})"); 117 118 /** Regular expression for trail comment. */ 119 private static final Pattern COMMENT_REGEXP = Pattern.compile(";[ ]*//+" 120 + "[a-zA-Z0-9 ]*|;[ ]*/[*]+[a-zA-Z0-9 ]*"); 121 122 /** Regular expression for all escaped chars. */ 123 private static final Pattern ALL_ESCAPED_CHARS = 124 Pattern.compile("^((\\\\u)[a-fA-F0-9]{4}" 125 + "||\\\\b|\\\\t|\\\\n|\\\\f|\\\\r|\\\\|\"|\')+$"); 126 127 /** Regular expression for non-printable unicode chars. */ 128 private static final Pattern NON_PRINTABLE_CHARS = Pattern.compile("\\\\u1680|\\\\u2028" 129 + "|\\\\u2029|\\\\u205(f|F)|\\\\u3000|\\\\u2007|\\\\u2000|\\\\u200(a|A)" 130 + "|\\\\u007(F|f)|\\\\u009(f|F)|\\\\u(f|F){4}|\\\\u007(F|f)|\\\\u00(a|A)(d|D)" 131 + "|\\\\u0600|\\\\u061(c|C)|\\\\u06(d|D){2}|\\\\u070(f|F)|\\\\u1680|\\\\u180(e|E)" 132 + "|\\\\u2000|\\\\u2028|\\\\u205(f|F)|\\\\u2066|\\\\u2067|\\\\u2068|\\\\u2069" 133 + "|\\\\u206(a|A)|\\\\u(d|D)800|\\\\u(f|F)(e|E)(f|F){2}|\\\\u(f|F){3}9" 134 + "|\\\\u(f|F){3}(a|A)|\\\\u0020|\\\\u00(a|A)0|\\\\u00(a|A)(d|D)|\\\\u0604" 135 + "|\\\\u061(c|C)|\\\\u06(d|D){2}|\\\\u070(f|F)|\\\\u1680|\\\\u180(e|E)|\\\\u200(f|F)" 136 + "|\\\\u202(f|F)|\\\\u2064|\\\\u2066|\\\\u2067|\\\\u2068|\\\\u2069|\\\\u206(f|F)" 137 + "|\\\\u(f|F)8(f|F){2}|\\\\u(f|F)(e|E)(f|F){2}|\\\\u(f|F){3}9|\\\\u(f|F){3}(b|B)" 138 + "|\\\\u05(d|D)0|\\\\u05(f|F)3|\\\\u0600|\\\\u0750|\\\\u0(e|E)00|\\\\u1(e|E)00" 139 + "|\\\\u2100|\\\\u(f|F)(b|B)50|\\\\u(f|F)(e|E)70|\\\\u(F|f){2}61|\\\\u04(f|F)9" 140 + 
"|\\\\u05(b|B)(e|E)|\\\\u05(e|E)(a|A)|\\\\u05(f|F)4|\\\\u06(f|F){2}" 141 + "|\\\\u077(f|F)|\\\\u0(e|E)7(f|F)|\\\\u20(a|A)(f|F)|\\\\u213(a|A)|\\\\u0000" 142 + "|\\\\u(f|F)(d|D)(f|F){2}|\\\\u(f|F)(e|E)(f|F){2}|\\\\u(f|F){2}(d|D)(c|C)" 143 + "|\\\\u2002|\\\\u0085|\\\\u200(a|A)|\\\\u2005|\\\\u2000|\\\\u2029|\\\\u000(B|b)" 144 + "|\\\\u2008|\\\\u2003|\\\\u205(f|F)|\\\\u1680|\\\\u0009|\\\\u0020|\\\\u2006" 145 + "|\\\\u2001|\\\\u202(f|F)|\\\\u00(a|A)0|\\\\u000(c|C)|\\\\u2009|\\\\u2004|\\\\u2028" 146 + "|\\\\u2028|\\\\u2007|\\\\u2004|\\\\u2028|\\\\u2007|\\\\u2025" 147 + "|\\\\u(f|F){2}0(e|E)|\\\\u(f|F){2}61"); 148 149 /** Allow use escapes for non-printable(control) characters. */ 150 private boolean allowEscapesForControlCharacters; 151 152 /** Allow use escapes if trail comment is present. */ 153 private boolean allowByTailComment; 154 155 /** Allow if all characters in literal are escaped. */ 156 private boolean allowIfAllCharactersEscaped; 157 158 /** Allow escapes for space literals. */ 159 private boolean allowNonPrintableEscapes; 160 161 /** 162 * Set allowIfAllCharactersEscaped. 163 * @param allow user's value. 164 */ 165 public final void setAllowEscapesForControlCharacters(boolean allow) { 166 allowEscapesForControlCharacters = allow; 167 } 168 169 /** 170 * Set allowByTailComment. 171 * @param allow user's value. 172 */ 173 public final void setAllowByTailComment(boolean allow) { 174 allowByTailComment = allow; 175 } 176 177 /** 178 * Set allowIfAllCharactersEscaped. 179 * @param allow user's value. 180 */ 181 public final void setAllowIfAllCharactersEscaped(boolean allow) { 182 allowIfAllCharactersEscaped = allow; 183 } 184 185 /** 186 * Set allowSpaceEscapes. 187 * @param allow user's value. 
188 */ 189 public final void setAllowNonPrintableEscapes(boolean allow) { 190 allowNonPrintableEscapes = allow; 191 } 192 193 @Override 194 public int[] getDefaultTokens() { 195 return getAcceptableTokens(); 196 } 197 198 @Override 199 public int[] getAcceptableTokens() { 200 return new int[] {TokenTypes.STRING_LITERAL, TokenTypes.CHAR_LITERAL}; 201 } 202 203 @Override 204 public int[] getRequiredTokens() { 205 return getAcceptableTokens(); 206 } 207 208 @Override 209 public void visitToken(DetailAST ast) { 210 211 final String literal = ast.getText(); 212 213 if (hasUnicodeChar(literal) && !(allowByTailComment && hasTrailComment(ast) 214 || isAllCharactersEscaped(literal) 215 || allowEscapesForControlCharacters 216 && isOnlyUnicodeValidChars(literal, UNICODE_CONTROL) 217 || allowNonPrintableEscapes 218 && isOnlyUnicodeValidChars(literal, NON_PRINTABLE_CHARS))) { 219 log(ast.getLineNo(), "forbid.escaped.unicode.char"); 220 } 221 } 222 223 /** 224 * Checks if literal has Unicode chars. 225 * @param literal String literal. 226 * @return true if literal has Unicode chars. 227 */ 228 private static boolean hasUnicodeChar(String literal) { 229 return UNICODE_REGEXP.matcher(literal).find(); 230 } 231 232 /** 233 * Check if String literal contains Unicode control chars. 234 * @param literal String literal. 235 * @param pattern RegExp for valid characters. 236 * @return true, if String literal contains Unicode control chars. 237 */ 238 private static boolean isOnlyUnicodeValidChars(String literal, Pattern pattern) { 239 final int unicodeMatchesCounter = 240 countMatches(UNICODE_REGEXP, literal); 241 final int unicodeValidMatchesCounter = 242 countMatches(pattern, literal); 243 return unicodeMatchesCounter - unicodeValidMatchesCounter == 0; 244 } 245 246 /** 247 * Check if trail comment is present after ast token. 248 * @param ast current token. 249 * @return true if trail comment is present after ast token. 
250 */ 251 private boolean hasTrailComment(DetailAST ast) { 252 final DetailAST variableDef = getVariableDef(ast); 253 DetailAST semi; 254 255 if (variableDef == null) { 256 semi = getSemi(ast); 257 } 258 else { 259 semi = variableDef.getNextSibling(); 260 261 if (semi.getType() != TokenTypes.SEMI) { 262 semi = variableDef.getLastChild(); 263 } 264 } 265 266 boolean result = false; 267 if (semi != null) { 268 final int lineNo = semi.getLineNo(); 269 final String currentLine = getLine(lineNo - 1); 270 271 if (COMMENT_REGEXP.matcher(currentLine).find()) { 272 result = true; 273 } 274 } 275 276 return result; 277 } 278 279 /** 280 * Count regexp matches into String literal. 281 * @param pattern pattern. 282 * @param target String literal. 283 * @return count of regexp matches. 284 */ 285 private static int countMatches(Pattern pattern, String target) { 286 int matcherCounter = 0; 287 final Matcher matcher = pattern.matcher(target); 288 while (matcher.find()) { 289 matcherCounter++; 290 } 291 return matcherCounter; 292 } 293 294 /** 295 * Get variable definition. 296 * @param ast current token. 297 * @return variable definition. 298 */ 299 private static DetailAST getVariableDef(DetailAST ast) { 300 DetailAST result = ast.getParent(); 301 while (result != null 302 && result.getType() != TokenTypes.VARIABLE_DEF) { 303 result = result.getParent(); 304 } 305 return result; 306 } 307 308 /** 309 * Get semi token. 310 * @param ast current token. 311 * @return semi token or null. 312 */ 313 private static DetailAST getSemi(DetailAST ast) { 314 DetailAST result = ast.getParent(); 315 while (result != null 316 && result.getLastChild().getType() != TokenTypes.SEMI) { 317 result = result.getParent(); 318 } 319 if (result != null) { 320 result = result.getLastChild(); 321 } 322 return result; 323 } 324 325 /** 326 * Checks if all characters in String literal is escaped. 327 * @param literal current literal. 328 * @return true if all characters in String literal is escaped. 
329 */ 330 private boolean isAllCharactersEscaped(String literal) { 331 return allowIfAllCharactersEscaped 332 && ALL_ESCAPED_CHARS.matcher(literal.substring(1, 333 literal.length() - 1)).find(); 334 } 335}