////////////////////////////////////////////////////////////////////////////////
// checkstyle: Checks Java source code for adherence to a set of rules.
// Copyright (C) 2001-2022 the original author or authors.
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
////////////////////////////////////////////////////////////////////////////////

package com.puppycrawl.tools.checkstyle.checks;

import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.puppycrawl.tools.checkstyle.FileStatefulCheck;
import com.puppycrawl.tools.checkstyle.api.AbstractCheck;
import com.puppycrawl.tools.checkstyle.api.DetailAST;
import com.puppycrawl.tools.checkstyle.api.TextBlock;
import com.puppycrawl.tools.checkstyle.api.TokenTypes;
import com.puppycrawl.tools.checkstyle.utils.CheckUtil;
import com.puppycrawl.tools.checkstyle.utils.CodePointUtil;

/**
 * <p>
 * Restricts using
 * <a href = "https://docs.oracle.com/javase/specs/jls/se11/html/jls-3.html#jls-3.3">
 * Unicode escapes</a>
 * (such as <code>&#92;u221e</code>).
 * It is possible to allow using escapes for
 * <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
 * non-printable, control characters</a>.
 * Also, this check can be configured to allow using escapes
 * if trail comment is present. By the option it is possible to
 * allow using escapes if literal contains only them.
 * </p>
 * <ul>
 * <li>
 * Property {@code allowEscapesForControlCharacters} - Allow use escapes for
 * non-printable, control characters.
 * Type is {@code boolean}.
 * Default value is {@code false}.
 * </li>
 * <li>
 * Property {@code allowByTailComment} - Allow use escapes if trail comment is present.
 * Type is {@code boolean}.
 * Default value is {@code false}.
 * </li>
 * <li>
 * Property {@code allowIfAllCharactersEscaped} - Allow if all characters in literal are escaped.
 * Type is {@code boolean}.
 * Default value is {@code false}.
 * </li>
 * <li>
 * Property {@code allowNonPrintableEscapes} - Allow use escapes for
 * non-printable, whitespace characters.
 * Type is {@code boolean}.
 * Default value is {@code false}.
 * </li>
 * </ul>
 * <p>
 * To configure the check:
 * </p>
 * <pre>
 * &lt;module name="AvoidEscapedUnicodeCharacters"/&gt;
 * </pre>
 * <p>
 * Examples of using Unicode:</p>
 * <pre>
 * String unitAbbrev = "&#956;s";      // OK, perfectly clear even without a comment.
 * String unitAbbrev = "&#92;u03bcs";// violation, the reader has no idea what this is.
 * return '&#92;ufeff' + content; // OK, an example of non-printable,
 *                           // control characters (byte order mark).
 * </pre>
 * <p>
 * An example of how to configure the check to allow using escapes
 * for non-printable, control characters:
 * </p>
 * <pre>
 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
 *   &lt;property name="allowEscapesForControlCharacters" value="true"/&gt;
 * &lt;/module&gt;
 * </pre>
 * <p>
 * Example of using escapes for non-printable, control characters:
 * </p>
 * <pre>
 * String unitAbbrev = "&#956;s";      // OK, a normal String
 * String unitAbbrev = "&#92;u03bcs"; // violation, "&#92;u03bcs" is a printable character.
 * return '&#92;ufeff' + content; // OK, non-printable control character.
 * </pre>
 * <p>
 * An example of how to configure the check to allow using escapes
 * if trail comment is present:
 * </p>
 * <pre>
 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
 *   &lt;property name="allowByTailComment" value="true"/&gt;
 * &lt;/module&gt;
 * </pre>
 * <p>Example of using escapes if trail comment is present:
 * </p>
 * <pre>
 * String unitAbbrev = "&#956;s";      // OK, a normal String
 * String unitAbbrev = "&#92;u03bcs"; // OK, Greek letter mu, "s"
 * return '&#92;ufeff' + content;
 * // -----^--------------------- violation, comment is not used within same line.
 * </pre>
 * <p>
 * An example of how to configure the check to allow if
 * all characters in literal are escaped.
 * </p>
 * <pre>
 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
 *   &lt;property name="allowIfAllCharactersEscaped" value="true"/&gt;
 * &lt;/module&gt;
 * </pre>
 * <p>Example of using escapes if all characters in literal are escaped:</p>
 * <pre>
 * String unitAbbrev = "&#956;s";      // OK, a normal String
 * String unitAbbrev = "&#92;u03bcs"; // violation, not all characters are escaped ('s').
 * String unitAbbrev = "&#92;u03bc&#92;u03bc&#92;u03bc"; // OK
 * String unitAbbrev = "&#92;u03bc&#92;u03bcs";// violation, not all characters are escaped ('s').
 * return '&#92;ufeff' + content; // OK, all control characters are escaped
 * </pre>
 * <p>An example of how to configure the check to allow using escapes
 * for non-printable whitespace characters:
 * </p>
 * <pre>
 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
 *   &lt;property name="allowNonPrintableEscapes" value="true"/&gt;
 * &lt;/module&gt;
 * </pre>
 * <p>Example of using escapes for non-printable whitespace characters:</p>
 * <pre>
 * String unitAbbrev = "&#956;s";       // OK, a normal String
 * String unitAbbrev1 = "&#92;u03bcs"; // violation, printable escape character.
 * String unitAbbrev2 = "&#92;u03bc&#92;u03bc&#92;u03bc"; // violation, printable escape character.
 * String unitAbbrev3 = "&#92;u03bc&#92;u03bcs";// violation, printable escape character.
 * return '&#92;ufeff' + content; // OK, non-printable escape character.
 * </pre>
 * <p>
 * Parent is {@code com.puppycrawl.tools.checkstyle.TreeWalker}
 * </p>
 * <p>
 * Violation Message Keys:
 * </p>
 * <ul>
 * <li>
 * {@code forbid.escaped.unicode.char}
 * </li>
 * </ul>
 *
 * @since 5.8
 */
@FileStatefulCheck
public class AvoidEscapedUnicodeCharactersCheck
    extends AbstractCheck {

    /**
     * A key is pointing to the warning message text in "messages.properties"
     * file.
     */
    public static final String MSG_KEY = "forbid.escaped.unicode.char";

    /**
     * Regular expression for Unicode chars: a backslash, one or more {@code u}
     * and exactly four hexadecimal digits.
     */
    private static final Pattern UNICODE_REGEXP = Pattern.compile("\\\\u+[a-fA-F0-9]{4}");

    /**
     * Regular expression Unicode control characters.
     *
     * @see <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
     *     Appendix:Control characters</a>
     */
    private static final Pattern UNICODE_CONTROL = Pattern.compile("\\\\u+"
            + "(00[0-1][0-9A-Fa-f]"
            + "|00[8-9][0-9A-Fa-f]"
            + "|00[aA][dD]"
            + "|034[fF]"
            + "|070[fF]"
            + "|180[eE]"
            + "|200[b-fB-F]"
            + "|202[a-eA-E]"
            + "|206[0-4a-fA-F]"
            + "|[fF]{3}[9a-bA-B]"
            + "|[fF][eE][fF]{2})");

    /**
     * Regular expression for all escaped chars.
     * See "EscapeSequence" at
     * https://docs.oracle.com/javase/specs/jls/se15/html/jls-3.html#jls-3.10.7
     */
    private static final Pattern ALL_ESCAPED_CHARS = Pattern.compile("^("
            + UNICODE_REGEXP.pattern()
            + "|\""
            + "|'"
            + "|\\\\"
            + "|\\\\b"
            + "|\\\\f"
            + "|\\\\n"
            + "|\\R"
            + "|\\\\r"
            + "|\\\\s"
            + "|\\\\t"
            + ")+$");

    /** Regular expression for escaped backslash. */
    private static final Pattern ESCAPED_BACKSLASH = Pattern.compile("\\\\\\\\");

    /**
     * Regular expression for non-printable unicode chars.
     * Like {@link #UNICODE_REGEXP}, it accepts one or more {@code u} after the
     * backslash so that both patterns recognise the same escapes.
     */
    private static final Pattern NON_PRINTABLE_CHARS = Pattern.compile("\\\\u+(?:"
            + "0000"
            + "|0009"
            + "|000[bB]"
            + "|000[cC]"
            + "|0020"
            + "|007[fF]"
            + "|0085"
            + "|009[fF]"
            + "|00[aA]0"
            + "|00[aA][dD]"
            + "|04[fF]9"
            + "|05[bB][eE]"
            + "|05[dD]0"
            + "|05[eE][aA]"
            + "|05[fF]3"
            + "|05[fF]4"
            + "|0600"
            + "|0604"
            + "|061[cC]"
            + "|06[dD]{2}"
            + "|06[fF]{2}"
            + "|070[fF]"
            + "|0750"
            + "|077[fF]"
            + "|0[eE]00"
            + "|0[eE]7[fF]"
            + "|1680"
            + "|180[eE]"
            + "|1[eE]00"
            + "|2000"
            + "|2001"
            + "|2002"
            + "|2003"
            + "|2004"
            + "|2005"
            + "|2006"
            + "|2007"
            + "|2008"
            + "|2009"
            + "|200[aA]"
            + "|200[fF]"
            + "|2025"
            + "|2028"
            + "|2029"
            + "|202[fF]"
            + "|205[fF]"
            + "|2064"
            + "|2066"
            + "|2067"
            + "|2068"
            + "|2069"
            + "|206[aA]"
            + "|206[fF]"
            + "|20[aA][fF]"
            + "|2100"
            + "|213[aA]"
            + "|3000"
            + "|[dD]800"
            + "|[fF]8[fF]{2}"
            + "|[fF][bB]50"
            + "|[fF][dD][fF]{2}"
            + "|[fF][eE]70"
            + "|[fF][eE][fF]{2}"
            + "|[fF]{2}0[eE]"
            + "|[fF]{2}61"
            + "|[fF]{2}[dD][cC]"
            + "|[fF]{3}9"
            + "|[fF]{3}[aA]"
            + "|[fF]{3}[bB]"
            + "|[fF]{4}"
            + ")");

    /** Cpp style comments, looked up by line number. */
    private Map<Integer, TextBlock> singlelineComments;
    /** C style comments, looked up by line number. */
    private Map<Integer, List<TextBlock>> blockComments;

    /** Allow use escapes for non-printable, control characters. */
    private boolean allowEscapesForControlCharacters;

    /** Allow use escapes if trail comment is present. */
    private boolean allowByTailComment;

    /** Allow if all characters in literal are escaped. */
    private boolean allowIfAllCharactersEscaped;

    /** Allow use escapes for non-printable, whitespace characters. */
    private boolean allowNonPrintableEscapes;

    /**
     * Setter to allow use escapes for non-printable, control characters.
     *
     * @param allow user's value.
     */
    public final void setAllowEscapesForControlCharacters(boolean allow) {
        allowEscapesForControlCharacters = allow;
    }

    /**
     * Setter to allow use escapes if trail comment is present.
     *
     * @param allow user's value.
     */
    public final void setAllowByTailComment(boolean allow) {
        allowByTailComment = allow;
    }

    /**
     * Setter to allow if all characters in literal are escaped.
     *
     * @param allow user's value.
     */
    public final void setAllowIfAllCharactersEscaped(boolean allow) {
        allowIfAllCharactersEscaped = allow;
    }

    /**
     * Setter to allow use escapes for non-printable, whitespace characters.
     *
     * @param allow user's value.
     */
    public final void setAllowNonPrintableEscapes(boolean allow) {
        allowNonPrintableEscapes = allow;
    }

    @Override
    public int[] getDefaultTokens() {
        return getRequiredTokens();
    }

    @Override
    public int[] getAcceptableTokens() {
        return getRequiredTokens();
    }

    @Override
    public int[] getRequiredTokens() {
        return new int[] {
            TokenTypes.STRING_LITERAL,
            TokenTypes.CHAR_LITERAL,
            TokenTypes.TEXT_BLOCK_CONTENT,
        };
    }

    // suppress deprecation until https://github.com/checkstyle/checkstyle/issues/11166
    @SuppressWarnings("deprecation")
    @Override
    public void beginTree(DetailAST rootAST) {
        singlelineComments = getFileContents().getSingleLineComments();
        blockComments = getFileContents().getBlockComments();
    }

    @Override
    public void visitToken(DetailAST ast) {
        final String literal =
            CheckUtil.stripIndentAndInitialNewLineFromTextBlock(ast.getText());

        // An escaped backslash followed by 'u' and hex digits is ordinary text, not a
        // Unicode escape. Remove escaped backslashes once, so that detection and all
        // of the "allow" options look at exactly the same set of escapes.
        final String content = ESCAPED_BACKSLASH.matcher(literal).replaceAll("");

        if (hasUnicodeChar(content) && !isEscapeAllowed(ast, content)) {
            log(ast, MSG_KEY);
        }
    }

    /**
     * Checks whether any of the configured options permits the Unicode escapes
     * found in the literal.
     *
     * @param ast token of the literal, used to look for a trailing comment.
     * @param literal literal text with escaped backslashes already removed.
     * @return true if at least one enabled option allows the escapes.
     */
    private boolean isEscapeAllowed(DetailAST ast, String literal) {
        return allowByTailComment && hasTrailComment(ast)
                || isAllCharactersEscaped(literal)
                || allowEscapesForControlCharacters
                        && isOnlyUnicodeValidChars(literal, UNICODE_CONTROL)
                || allowNonPrintableEscapes
                        && isOnlyUnicodeValidChars(literal, NON_PRINTABLE_CHARS);
    }

    /**
     * Checks if literal has Unicode chars.
     *
     * @param literal literal text with escaped backslashes already removed.
     * @return true if literal has Unicode chars.
     */
    private static boolean hasUnicodeChar(String literal) {
        return UNICODE_REGEXP.matcher(literal).find();
    }

    /**
     * Checks if every Unicode escape in the literal also matches the given
     * pattern, by comparing the number of matches of both expressions.
     *
     * @param literal literal text with escaped backslashes already removed.
     * @param pattern RegExp for valid characters.
     * @return true, if the literal has as many matches of {@code pattern}
     *     as it has Unicode escapes.
     */
    private static boolean isOnlyUnicodeValidChars(String literal, Pattern pattern) {
        final int unicodeMatchesCounter =
                countMatches(UNICODE_REGEXP, literal);
        final int unicodeValidMatchesCounter =
                countMatches(pattern, literal);
        return unicodeMatchesCounter == unicodeValidMatchesCounter;
    }

    /**
     * Check if trail comment is present after ast token.
     *
     * @param ast current token.
     * @return true if trail comment is present after ast token.
     */
    private boolean hasTrailComment(DetailAST ast) {
        int lineNo = ast.getLineNo();

        // Since the trailing comment in the case of text blocks must follow the """ delimiter,
        // we need to look for it after TEXT_BLOCK_LITERAL_END.
        if (ast.getType() == TokenTypes.TEXT_BLOCK_CONTENT) {
            lineNo = ast.getNextSibling().getLineNo();
        }
        boolean result = false;
        if (singlelineComments.containsKey(lineNo)) {
            result = true;
        }
        else {
            final List<TextBlock> commentList = blockComments.get(lineNo);
            if (commentList != null) {
                // only the last block comment registered for the line can be trailing
                final TextBlock comment = commentList.get(commentList.size() - 1);
                final int[] codePoints = getLineCodePoints(lineNo - 1);
                result = isTrailingBlockComment(comment, codePoints);
            }
        }
        return result;
    }

    /**
     * Whether the C style comment is trailing: it either spans more than one
     * line, or nothing but blank characters follows it on its line.
     *
     * @param comment the comment to check.
     * @param codePoints the first line of the comment, in unicode code points
     * @return true if the comment is trailing.
     */
    private static boolean isTrailingBlockComment(TextBlock comment, int... codePoints) {
        return comment.getText().length != 1
            || CodePointUtil.isBlank(Arrays.copyOfRange(codePoints,
                comment.getEndColNo() + 1, codePoints.length));
    }

    /**
     * Count regexp matches into String literal.
     *
     * @param pattern pattern.
     * @param target String literal.
     * @return count of regexp matches.
     */
    private static int countMatches(Pattern pattern, String target) {
        int matcherCounter = 0;
        final Matcher matcher = pattern.matcher(target);
        while (matcher.find()) {
            matcherCounter++;
        }
        return matcherCounter;
    }

    /**
     * Checks if all characters in String literal are escaped.
     * Always false unless {@code allowIfAllCharactersEscaped} is enabled.
     *
     * @param literal literal text with escaped backslashes already removed.
     * @return true if the option is enabled and all characters in String literal are escaped.
     */
    private boolean isAllCharactersEscaped(String literal) {
        return allowIfAllCharactersEscaped
                && ALL_ESCAPED_CHARS.matcher(literal).find();
    }

}