001////////////////////////////////////////////////////////////////////////////////
002// checkstyle: Checks Java source code for adherence to a set of rules.
003// Copyright (C) 2001-2022 the original author or authors.
004//
005// This library is free software; you can redistribute it and/or
006// modify it under the terms of the GNU Lesser General Public
007// License as published by the Free Software Foundation; either
008// version 2.1 of the License, or (at your option) any later version.
009//
010// This library is distributed in the hope that it will be useful,
011// but WITHOUT ANY WARRANTY; without even the implied warranty of
012// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
013// Lesser General Public License for more details.
014//
015// You should have received a copy of the GNU Lesser General Public
016// License along with this library; if not, write to the Free Software
017// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
018////////////////////////////////////////////////////////////////////////////////
019
020package com.puppycrawl.tools.checkstyle.checks;
021
022import java.util.Arrays;
023import java.util.List;
024import java.util.Map;
025import java.util.regex.Matcher;
026import java.util.regex.Pattern;
027
028import com.puppycrawl.tools.checkstyle.FileStatefulCheck;
029import com.puppycrawl.tools.checkstyle.api.AbstractCheck;
030import com.puppycrawl.tools.checkstyle.api.DetailAST;
031import com.puppycrawl.tools.checkstyle.api.TextBlock;
032import com.puppycrawl.tools.checkstyle.api.TokenTypes;
033import com.puppycrawl.tools.checkstyle.utils.CheckUtil;
034import com.puppycrawl.tools.checkstyle.utils.CodePointUtil;
035
036/**
037 * <p>
038 * Restricts using
039 * <a href = "https://docs.oracle.com/javase/specs/jls/se11/html/jls-3.html#jls-3.3">
040 * Unicode escapes</a>
041 * (such as &#92;u221e). It is possible to allow using escapes for
042 * <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
043 * non-printable, control characters</a>.
044 * Also, this check can be configured to allow using escapes
045 * if trail comment is present. By the option it is possible to
046 * allow using escapes if literal contains only them.
047 * </p>
048 * <ul>
049 * <li>
050 * Property {@code allowEscapesForControlCharacters} - Allow use escapes for
051 * non-printable, control characters.
052 * Type is {@code boolean}.
053 * Default value is {@code false}.
054 * </li>
055 * <li>
056 * Property {@code allowByTailComment} - Allow use escapes if trail comment is present.
057 * Type is {@code boolean}.
058 * Default value is {@code false}.
059 * </li>
060 * <li>
061 * Property {@code allowIfAllCharactersEscaped} - Allow if all characters in literal are escaped.
062 * Type is {@code boolean}.
063 * Default value is {@code false}.
064 * </li>
065 * <li>
066 * Property {@code allowNonPrintableEscapes} - Allow use escapes for
067 * non-printable, whitespace characters.
068 * Type is {@code boolean}.
069 * Default value is {@code false}.
070 * </li>
071 * </ul>
072 * <p>
073 * To configure the check:
074 * </p>
075 * <pre>
076 * &lt;module name="AvoidEscapedUnicodeCharacters"/&gt;
077 * </pre>
078 * <p>
079 * Examples of using Unicode:</p>
080 * <pre>
081 * String unitAbbrev = "μs";     // OK, perfectly clear even without a comment.
082 * String unitAbbrev = "&#92;u03bcs";// violation, the reader has no idea what this is.
083 * return '&#92;ufeff' + content;    // OK, an example of non-printable,
084 *                               // control characters (byte order mark).
085 * </pre>
086 * <p>
087 * An example of how to configure the check to allow using escapes
088 * for non-printable, control characters:
089 * </p>
090 * <pre>
091 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
092 *   &lt;property name="allowEscapesForControlCharacters" value="true"/&gt;
093 * &lt;/module&gt;
094 * </pre>
095 * <p>
096 * Example of using escapes for non-printable, control characters:
097 * </p>
098 * <pre>
099 * String unitAbbrev = "μs";      // OK, a normal String
100 * String unitAbbrev = "&#92;u03bcs"; // violation, "&#92;u03bcs" is a printable character.
101 * return '&#92;ufeff' + content;     // OK, non-printable control character.
102 * </pre>
103 * <p>
104 * An example of how to configure the check to allow using escapes
105 * if trail comment is present:
106 * </p>
107 * <pre>
108 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
109 *   &lt;property name="allowByTailComment" value="true"/&gt;
110 * &lt;/module&gt;
111 * </pre>
112 * <p>Example of using escapes if trail comment is present:
113 * </p>
114 * <pre>
115 * String unitAbbrev = "μs";      // OK, a normal String
116 * String unitAbbrev = "&#92;u03bcs"; // OK, Greek letter mu, "s"
117 * return '&#92;ufeff' + content;
118 * // -----^--------------------- violation, comment is not used within same line.
119 * </pre>
120 * <p>
121 * An example of how to configure the check to allow if
122 * all characters in literal are escaped.
123 * </p>
124 * <pre>
125 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
126 *   &lt;property name="allowIfAllCharactersEscaped" value="true"/&gt;
127 * &lt;/module&gt;
128 * </pre>
129 * <p>Example of using escapes if all characters in literal are escaped:</p>
130 * <pre>
131 * String unitAbbrev = "μs";      // OK, a normal String
132 * String unitAbbrev = "&#92;u03bcs"; // violation, not all characters are escaped ('s').
133 * String unitAbbrev = "&#92;u03bc&#92;u03bc&#92;u03bc"; // OK
134 * String unitAbbrev = "&#92;u03bc&#92;u03bcs";// violation, not all characters are escaped ('s').
135 * return '&#92;ufeff' + content;          // OK, all control characters are escaped
136 * </pre>
137 * <p>An example of how to configure the check to allow using escapes
138 * for non-printable whitespace characters:
139 * </p>
140 * <pre>
141 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
142 *   &lt;property name="allowNonPrintableEscapes" value="true"/&gt;
143 * &lt;/module&gt;
144 * </pre>
145 * <p>Example of using escapes for non-printable whitespace characters:</p>
146 * <pre>
147 * String unitAbbrev = "μs";       // OK, a normal String
148 * String unitAbbrev1 = "&#92;u03bcs"; // violation, printable escape character.
149 * String unitAbbrev2 = "&#92;u03bc&#92;u03bc&#92;u03bc"; // violation, printable escape character.
150 * String unitAbbrev3 = "&#92;u03bc&#92;u03bcs";// violation, printable escape character.
151 * return '&#92;ufeff' + content;           // OK, non-printable escape character.
152 * </pre>
153 * <p>
154 * Parent is {@code com.puppycrawl.tools.checkstyle.TreeWalker}
155 * </p>
156 * <p>
157 * Violation Message Keys:
158 * </p>
159 * <ul>
160 * <li>
161 * {@code forbid.escaped.unicode.char}
162 * </li>
163 * </ul>
164 *
165 * @since 5.8
166 */
167@FileStatefulCheck
168public class AvoidEscapedUnicodeCharactersCheck
169    extends AbstractCheck {
170
171    /**
172     * A key is pointing to the warning message text in "messages.properties"
173     * file.
174     */
175    public static final String MSG_KEY = "forbid.escaped.unicode.char";
176
177    /** Regular expression for Unicode chars. */
178    private static final Pattern UNICODE_REGEXP = Pattern.compile("\\\\u+[a-fA-F0-9]{4}");
179
180    /**
181     * Regular expression Unicode control characters.
182     *
183     * @see <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
184     *     Appendix:Control characters</a>
185     */
186    private static final Pattern UNICODE_CONTROL = Pattern.compile("\\\\u+"
187            + "(00[0-1][0-9A-Fa-f]"
188            + "|00[8-9][0-9A-Fa-f]"
189            + "|00[aA][dD]"
190            + "|034[fF]"
191            + "|070[fF]"
192            + "|180[eE]"
193            + "|200[b-fB-F]"
194            + "|202[a-eA-E]"
195            + "|206[0-4a-fA-F]"
196            + "|[fF]{3}[9a-bA-B]"
197            + "|[fF][eE][fF]{2})");
198
199    /**
200     * Regular expression for all escaped chars.
201     * See "EscapeSequence" at
202     * https://docs.oracle.com/javase/specs/jls/se15/html/jls-3.html#jls-3.10.7
203     */
204    private static final Pattern ALL_ESCAPED_CHARS = Pattern.compile("^("
205            + UNICODE_REGEXP.pattern()
206            + "|\""
207            + "|'"
208            + "|\\\\"
209            + "|\\\\b"
210            + "|\\\\f"
211            + "|\\\\n"
212            + "|\\R"
213            + "|\\\\r"
214            + "|\\\\s"
215            + "|\\\\t"
216            + ")+$");
217
218    /** Regular expression for escaped backslash. */
219    private static final Pattern ESCAPED_BACKSLASH = Pattern.compile("\\\\\\\\");
220
221    /** Regular expression for non-printable unicode chars. */
222    private static final Pattern NON_PRINTABLE_CHARS = Pattern.compile("\\\\u0000"
223            + "|\\\\u0009"
224            + "|\\\\u000[bB]"
225            + "|\\\\u000[cC]"
226            + "|\\\\u0020"
227            + "|\\\\u007[fF]"
228            + "|\\\\u0085"
229            + "|\\\\u009[fF]"
230            + "|\\\\u00[aA]0"
231            + "|\\\\u00[aA][dD]"
232            + "|\\\\u04[fF]9"
233            + "|\\\\u05[bB][eE]"
234            + "|\\\\u05[dD]0"
235            + "|\\\\u05[eE][aA]"
236            + "|\\\\u05[fF]3"
237            + "|\\\\u05[fF]4"
238            + "|\\\\u0600"
239            + "|\\\\u0604"
240            + "|\\\\u061[cC]"
241            + "|\\\\u06[dD]{2}"
242            + "|\\\\u06[fF]{2}"
243            + "|\\\\u070[fF]"
244            + "|\\\\u0750"
245            + "|\\\\u077[fF]"
246            + "|\\\\u0[eE]00"
247            + "|\\\\u0[eE]7[fF]"
248            + "|\\\\u1680"
249            + "|\\\\u180[eE]"
250            + "|\\\\u1[eE]00"
251            + "|\\\\u2000"
252            + "|\\\\u2001"
253            + "|\\\\u2002"
254            + "|\\\\u2003"
255            + "|\\\\u2004"
256            + "|\\\\u2005"
257            + "|\\\\u2006"
258            + "|\\\\u2007"
259            + "|\\\\u2008"
260            + "|\\\\u2009"
261            + "|\\\\u200[aA]"
262            + "|\\\\u200[fF]"
263            + "|\\\\u2025"
264            + "|\\\\u2028"
265            + "|\\\\u2029"
266            + "|\\\\u202[fF]"
267            + "|\\\\u205[fF]"
268            + "|\\\\u2064"
269            + "|\\\\u2066"
270            + "|\\\\u2067"
271            + "|\\\\u2068"
272            + "|\\\\u2069"
273            + "|\\\\u206[aA]"
274            + "|\\\\u206[fF]"
275            + "|\\\\u20[aA][fF]"
276            + "|\\\\u2100"
277            + "|\\\\u213[aA]"
278            + "|\\\\u3000"
279            + "|\\\\u[dD]800"
280            + "|\\\\u[fF]8[fF]{2}"
281            + "|\\\\u[fF][bB]50"
282            + "|\\\\u[fF][dD][fF]{2}"
283            + "|\\\\u[fF][eE]70"
284            + "|\\\\u[fF][eE][fF]{2}"
285            + "|\\\\u[fF]{2}0[eE]"
286            + "|\\\\u[fF]{2}61"
287            + "|\\\\u[fF]{2}[dD][cC]"
288            + "|\\\\u[fF]{3}9"
289            + "|\\\\u[fF]{3}[aA]"
290            + "|\\\\u[fF]{3}[bB]"
291            + "|\\\\u[fF]{4}");
292
293    /** Cpp style comments. */
294    private Map<Integer, TextBlock> singlelineComments;
295    /** C style comments. */
296    private Map<Integer, List<TextBlock>> blockComments;
297
298    /** Allow use escapes for non-printable, control characters. */
299    private boolean allowEscapesForControlCharacters;
300
301    /** Allow use escapes if trail comment is present. */
302    private boolean allowByTailComment;
303
304    /** Allow if all characters in literal are escaped. */
305    private boolean allowIfAllCharactersEscaped;
306
307    /** Allow use escapes for non-printable, whitespace characters. */
308    private boolean allowNonPrintableEscapes;
309
310    /**
311     * Setter to allow use escapes for non-printable, control characters.
312     *
313     * @param allow user's value.
314     */
315    public final void setAllowEscapesForControlCharacters(boolean allow) {
316        allowEscapesForControlCharacters = allow;
317    }
318
319    /**
320     * Setter to allow use escapes if trail comment is present.
321     *
322     * @param allow user's value.
323     */
324    public final void setAllowByTailComment(boolean allow) {
325        allowByTailComment = allow;
326    }
327
328    /**
329     * Setter to allow if all characters in literal are escaped.
330     *
331     * @param allow user's value.
332     */
333    public final void setAllowIfAllCharactersEscaped(boolean allow) {
334        allowIfAllCharactersEscaped = allow;
335    }
336
337    /**
338     * Setter to allow use escapes for non-printable, whitespace characters.
339     *
340     * @param allow user's value.
341     */
342    public final void setAllowNonPrintableEscapes(boolean allow) {
343        allowNonPrintableEscapes = allow;
344    }
345
346    @Override
347    public int[] getDefaultTokens() {
348        return getRequiredTokens();
349    }
350
351    @Override
352    public int[] getAcceptableTokens() {
353        return getRequiredTokens();
354    }
355
356    @Override
357    public int[] getRequiredTokens() {
358        return new int[] {
359            TokenTypes.STRING_LITERAL,
360            TokenTypes.CHAR_LITERAL,
361            TokenTypes.TEXT_BLOCK_CONTENT,
362        };
363    }
364
365    // suppress deprecation until https://github.com/checkstyle/checkstyle/issues/11166
366    @SuppressWarnings("deprecation")
367    @Override
368    public void beginTree(DetailAST rootAST) {
369        singlelineComments = getFileContents().getSingleLineComments();
370        blockComments = getFileContents().getBlockComments();
371    }
372
373    @Override
374    public void visitToken(DetailAST ast) {
375        final String literal =
376            CheckUtil.stripIndentAndInitialNewLineFromTextBlock(ast.getText());
377
378        if (hasUnicodeChar(literal) && !(allowByTailComment && hasTrailComment(ast)
379                || isAllCharactersEscaped(literal)
380                || allowEscapesForControlCharacters
381                        && isOnlyUnicodeValidChars(literal, UNICODE_CONTROL)
382                || allowNonPrintableEscapes
383                        && isOnlyUnicodeValidChars(literal, NON_PRINTABLE_CHARS))) {
384            log(ast, MSG_KEY);
385        }
386    }
387
388    /**
389     * Checks if literal has Unicode chars.
390     *
391     * @param literal String literal.
392     * @return true if literal has Unicode chars.
393     */
394    private static boolean hasUnicodeChar(String literal) {
395        final String literalWithoutEscapedBackslashes =
396                ESCAPED_BACKSLASH.matcher(literal).replaceAll("");
397        return UNICODE_REGEXP.matcher(literalWithoutEscapedBackslashes).find();
398    }
399
400    /**
401     * Check if String literal contains Unicode control chars.
402     *
403     * @param literal String literal.
404     * @param pattern RegExp for valid characters.
405     * @return true, if String literal contains Unicode control chars.
406     */
407    private static boolean isOnlyUnicodeValidChars(String literal, Pattern pattern) {
408        final int unicodeMatchesCounter =
409                countMatches(UNICODE_REGEXP, literal);
410        final int unicodeValidMatchesCounter =
411                countMatches(pattern, literal);
412        return unicodeMatchesCounter - unicodeValidMatchesCounter == 0;
413    }
414
415    /**
416     * Check if trail comment is present after ast token.
417     *
418     * @param ast current token.
419     * @return true if trail comment is present after ast token.
420     */
421    private boolean hasTrailComment(DetailAST ast) {
422        int lineNo = ast.getLineNo();
423
424        // Since the trailing comment in the case of text blocks must follow the """ delimiter,
425        // we need to look for it after TEXT_BLOCK_LITERAL_END.
426        if (ast.getType() == TokenTypes.TEXT_BLOCK_CONTENT) {
427            lineNo = ast.getNextSibling().getLineNo();
428        }
429        boolean result = false;
430        if (singlelineComments.containsKey(lineNo)) {
431            result = true;
432        }
433        else {
434            final List<TextBlock> commentList = blockComments.get(lineNo);
435            if (commentList != null) {
436                final TextBlock comment = commentList.get(commentList.size() - 1);
437                final int[] codePoints = getLineCodePoints(lineNo - 1);
438                result = isTrailingBlockComment(comment, codePoints);
439            }
440        }
441        return result;
442    }
443
444    /**
445     * Whether the C style comment is trailing.
446     *
447     * @param comment the comment to check.
448     * @param codePoints the first line of the comment, in unicode code points
449     * @return true if the comment is trailing.
450     */
451    private static boolean isTrailingBlockComment(TextBlock comment, int... codePoints) {
452        return comment.getText().length != 1
453            || CodePointUtil.isBlank(Arrays.copyOfRange(codePoints,
454                comment.getEndColNo() + 1, codePoints.length));
455    }
456
457    /**
458     * Count regexp matches into String literal.
459     *
460     * @param pattern pattern.
461     * @param target String literal.
462     * @return count of regexp matches.
463     */
464    private static int countMatches(Pattern pattern, String target) {
465        int matcherCounter = 0;
466        final Matcher matcher = pattern.matcher(target);
467        while (matcher.find()) {
468            matcherCounter++;
469        }
470        return matcherCounter;
471    }
472
473    /**
474     * Checks if all characters in String literal is escaped.
475     *
476     * @param literal current literal.
477     * @return true if all characters in String literal is escaped.
478     */
479    private boolean isAllCharactersEscaped(String literal) {
480        return allowIfAllCharactersEscaped
481                && ALL_ESCAPED_CHARS.matcher(literal).find();
482    }
483
484}