// blob: 946f6a07e958d9da81be1dbeefb5cfef84056aa9 [file] [log] [blame]
/*
* Copyright (C) 2025 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.android.textclassifier;
import static java.lang.String.format;
import android.icu.util.ULocale;
import android.util.ArrayMap;
import android.view.textclassifier.TextClassifier;
import android.view.textclassifier.TextLanguage;
import androidx.annotation.NonNull;
import androidx.annotation.Nullable;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Helper methods for detecting One-Time Password (OTP) codes in a text.
 *
 * <p>This class is designed to be lightweight with minimal dependencies, allowing it
 * to be easily exported and built as a standalone library.
 *
 * <p>Every compiled regular expression is kept as a per-thread {@link Matcher} that is reset and
 * reused for each input, so patterns are compiled at most once per thread.
 */
public class OtpDetector {
  /** Flags applied to every regular expression compiled by this class. */
  private static final int PATTERN_FLAGS =
      Pattern.DOTALL | Pattern.CASE_INSENSITIVE | Pattern.MULTILINE;

  /** Minimum confidence score a detected language must reach before it is trusted. */
  private static final float TC_THRESHOLD = 0.6f;

  /** Per-language matchers for OTP context words, keyed by language tag (for example "en"). */
  private static final ArrayMap<String, ThreadLocal<Matcher>> EXTRA_LANG_OTP_REGEX =
      new ArrayMap<>();

  /** Matches any substring that is shaped like an OTP code. */
  private static final ThreadLocal<Matcher> OTP_REGEX = compileToRegex(RegExStrings.ALL_OTP);

  /**
   * A combination of common false positives. These matches are expected to be longer than (or equal
   * in length to) otp matches
   */
  private static final ThreadLocal<Matcher> FALSE_POSITIVE_REGEX =
      compileToRegex(RegExStrings.FALSE_POSITIVE);

  static {
    EXTRA_LANG_OTP_REGEX.put(
        ULocale.ENGLISH.toLanguageTag(),
        createDictionaryRegex(RegExStrings.ENGLISH_CONTEXT_WORDS));
  }

  private OtpDetector() {}

  /**
   * Wraps a regular expression in a lazily initialised, per-thread {@link Matcher}.
   *
   * @param pattern the regular expression source, compiled with {@link #PATTERN_FLAGS}
   * @return a thread-local whose matcher is created on first access from each thread
   */
  private static ThreadLocal<Matcher> compileToRegex(String pattern) {
    return ThreadLocal.withInitial(() -> Pattern.compile(pattern, PATTERN_FLAGS).matcher(""));
  }

  /**
   * Creates a regular expression to match any of a series of individual words, case insensitive.
   * Each entry must start and end on a word boundary.
   *
   * @param words regular expression fragments, one per context word or phrase
   * @return a per-thread matcher for the alternation of all {@code words}
   */
  private static ThreadLocal<Matcher> createDictionaryRegex(String[] words) {
    StringBuilder regex = new StringBuilder("(");
    for (int i = 0; i < words.length; i++) {
      if (i > 0) {
        regex.append('|');
      }
      // The non-capturing group keeps both word boundaries attached to the whole entry even if
      // the entry itself contains a top-level alternation.
      regex.append("\\b(?:").append(words[i]).append(")\\b");
    }
    regex.append(')');
    return compileToRegex(regex.toString());
  }

  /**
   * Checks if a string of text might contain an OTP, based on several regular expressions, and
   * potentially using a textClassifier to eliminate false positives.
   *
   * <p><b>Note:</b> This method is meant to be called in Android V only. Android B+ should make
   * TextClassifier request to determine if the text contains OTP.</p>
   *
   * <p><b>Important:</b> Signature of this method to be kept intact since it is intended for
   * use by external modules via an exported library.
   *
   * @param text The input text to scan for OTP keywords. Must not be null.
   * @param tc TextClassifier instance to be used to find the language of the text.
   * @return {@code true} if an OTP is determined to be in the text, {@code false} otherwise.
   */
  public static boolean containsOtp(
      @NonNull String text,
      @NonNull TextClassifier tc) {
    // Run the regex check first so the language is only detected for texts with an OTP-like code.
    if (!containsOtpLikePattern(text)) {
      return false;
    }
    return containsOtpWithLanguage(text, getTextLanguage(text, tc));
  }

  /**
   * Checks if the input text likely contains a language-specific keyword commonly associated with
   * OTP, based on the provided language hint.
   *
   * <p>This method first attempts to determine a high-confidence {@link ULocale} corresponding to
   * the given {@link TextLanguage}. If a reliable locale cannot be determined, it assumes no
   * relevant OTP keyword is present for that language. Otherwise, it delegates to
   * {@link #hasLanguageSpecificOtpWord} to perform the actual check using the language tag derived
   * from the determined locale.
   *
   * @param text The input text to scan for OTP keywords. Must not be null.
   * @param language The language hint for the input text, used to determine the appropriate locale
   *     for keyword matching. Must not be null.
   * @return {@code true} if the text is determined to contain a language-specific OTP keyword
   *     matching the language hint, {@code false} otherwise (including cases where the language
   *     could not be confidently identified or no specific OTP keyword is found).
   */
  protected static boolean containsOtpWithLanguage(
      @NonNull String text, @NonNull TextLanguage language) {
    ULocale uLocale = getLanguageWithRegex(language);
    if (uLocale == null) {
      return false;
    }
    return hasLanguageSpecificOtpWord(text, uLocale.toLanguageTag());
  }

  /**
   * Checks if the given text contains a pattern resembling an OTP.
   *
   * <p>This method attempts to identify such patterns by matching against a regular expression.
   * Avoids false positives by checking for common patterns that might be mistaken for OTPs, such
   * as phone numbers or dates.</p>
   *
   * <p>An OTP-like match is discarded only when the characters it covers lie entirely inside a
   * false-positive match at the same place in the text. For example, in "Your OTP can't be shared
   * at this point, please call (888) 888-8888" the candidate "888-8888" lies inside the phone
   * number and is discarded, while in "Your code is 8888. Call (888) 888-8888 for help" the
   * stand-alone "8888" is kept even though the same digits also occur in the phone number.
   *
   * @param text The text to be checked.
   * @return {@code true} if the text contains an OTP-like pattern, {@code false} otherwise.
   */
  protected static boolean containsOtpLikePattern(String text) {
    Matcher otp = OTP_REGEX.get().reset(text);
    if (!otp.find()) {
      return false;
    }
    Matcher falsePositive = FALSE_POSITIVE_REGEX.get().reset(text);
    boolean hasFalsePositive = falsePositive.find();
    // Both matchers report non-overlapping matches in increasing text order, so one forward pass
    // over each is enough: a false positive that ends before the current OTP candidate ends can
    // cover neither this candidate nor any later one.
    do {
      while (hasFalsePositive && falsePositive.end() < otp.end()) {
        hasFalsePositive = falsePositive.find();
      }
      // The current false positive is the first one ending at or after the candidate's end; it
      // covers the candidate only if it also starts at or before the candidate's start.
      boolean coveredByFalsePositive = hasFalsePositive && falsePositive.start() <= otp.start();
      if (!coveredByFalsePositive) {
        return true;
      }
    } while (otp.find());
    return false;
  }

  /**
   * Checks if the given text contains a language-specific word or phrase associated with OTPs.
   * This method uses regular expressions defined for specific languages to identify these words.
   *
   * @param text The text to check.
   * @param languageTag The language tag (e.g., "en", "es", "fr") for which to check.
   * @return {@code true} if the text contains a language-specific OTP word, {@code false} otherwise.
   *     Returns {@code false} if no language-specific regex is defined for the given tag.
   */
  private static boolean hasLanguageSpecificOtpWord(
      @NonNull String text, @NonNull String languageTag) {
    ThreadLocal<Matcher> contextWordRegex = EXTRA_LANG_OTP_REGEX.get(languageTag);
    if (contextWordRegex == null) {
      return false;
    }
    return contextWordRegex.get().reset(text).find();
  }

  /** Asks the given TextClassifier to detect the language of the given text. */
  private static TextLanguage getTextLanguage(@NonNull String text, @NonNull TextClassifier tc) {
    TextLanguage.Request langRequest = new TextLanguage.Request.Builder(text).build();
    return tc.detectLanguage(langRequest);
  }

  /**
   * Returns the language with the highest confidence score that meets the minimum threshold and
   * has a language-specific regex, or {@code null} if there is none. When several such languages
   * share the highest score, the one listed last in {@code lang} is returned.
   */
  @Nullable
  private static ULocale getLanguageWithRegex(@NonNull TextLanguage lang) {
    float highestConfidence = 0;
    ULocale highestConfidenceLocale = null;
    int hypothesisCount = lang.getLocaleHypothesisCount();
    for (int i = 0; i < hypothesisCount; i++) {
      ULocale locale = lang.getLocale(i);
      float confidence = lang.getConfidenceScore(locale);
      if (confidence >= TC_THRESHOLD
          && confidence >= highestConfidence
          && EXTRA_LANG_OTP_REGEX.containsKey(locale.toLanguageTag())) {
        highestConfidence = confidence;
        highestConfidenceLocale = locale;
      }
    }
    return highestConfidenceLocale;
  }

  /** Source strings for the regular expressions used by {@link OtpDetector}. */
  private static final class RegExStrings {
    /*
     * A regex matching a line start, open paren, arrow, colon (not preceded by a digit), open
     * square bracket, equals sign, double or single quote, ideographic char, or a space that is
     * not preceded by a number. It will not consume the start char (meaning START won't be
     * included in the matched string)
     */
    private static final String START =
        "(^|(?<=((^|[^0-9])\\s)|[>(\"'=\\[\\p{IsIdeographic}]|[^0-9]:))";

    /*
     * A regex matching a line end, a space that is not followed by a number, an ideographic char,
     * or a period, close paren, close square bracket, single or double quote, exclamation point,
     * question mark, or comma. It will not consume the end char
     */
    private static final String END = "(?=\\s[^0-9]|$|\\p{IsIdeographic}|[.?!,)'\\]\"])";

    private static final String ALL_OTP;

    static {
      /* One single OTP char. A number or alphabetical char (that isn't also ideographic) */
      final String otpChar = "([0-9\\p{IsAlphabetic}&&[^\\p{IsIdeographic}]])";
      /* One OTP char, followed by an optional dash */
      final String otpCharWithDash = format("(%s-?)", otpChar);
      /*
       * Performs a lookahead to find a digit after 0 to 7 OTP chars. This ensures that our
       * potential OTP code contains at least one number
       */
      final String findDigit = format("(?=%s{0,7}\\d)", otpCharWithDash);
      /*
       * Matches between 5 and 8 otp chars, with dashes in between. Here, we are assuming an OTP
       * code is 5-8 characters long. The last char must not be followed by a dash
       */
      final String otpChars = format("(%s{4,7}%s)", otpCharWithDash, otpChar);
      /* A regex matching four digit numerical codes */
      final String fourDigits = "(\\d{4})";
      final String fiveToEightAlphanumAtLeastOneNum = format("(%s%s)", findDigit, otpChars);
      /* A regex matching two pairs of 3 digits (ex "123 456") */
      final String sixDigitsWithSpace = "(\\d{3}\\s\\d{3})";
      /*
       * Combining the regular expressions above, we get an OTP regex:
       *   1. search for START, THEN
       *   2. match ONE of
       *      a. alphanumeric sequence, at least one number, length 5-8, with optional dashes
       *      b. 4 numbers in a row
       *      c. pair of 3 digit codes separated by a space
       *      THEN
       *   3. search for END
       * Ex: "6454", " 345 678.", "[YDT-456]"
       */
      ALL_OTP =
          format(
              "%s(%s|%s|%s)%s",
              START, fiveToEightAlphanumAtLeastOneNum, fourDigits, sixDigitsWithSpace, END);
    }

    private static final String FALSE_POSITIVE;

    static {
      /*
       * A Date regular expression. Looks for dates with the month, day, and year separated by
       * dashes. Handles one and two digit months and days, and four or two-digit years. It makes
       * the following assumptions: Dates and months will never be higher than 39. If a four digit
       * year is used, the leading digit will be 1 or 2
       */
      final String dateWithDashes = "([0-3]?\\d-[0-3]?\\d-([12]\\d)?\\d\\d)";
      /*
       * Matches a ten digit phone number, when the area code is separated by a space or dash.
       * Supports optional parentheses around the area code, and an optional dash or space in
       * between the rest of the numbers. This format registers as an otp match due to the space
       * between the area code and the rest, but shouldn't.
       */
      final String phoneWithSpace = "(\\(?\\d{3}\\)?(-|\\s)?\\d{3}(-|\\s)?\\d{4})";
      /*
       * A combination of common false positives. These matches are expected to be longer than (or
       * equal in length to) otp matches.
       */
      FALSE_POSITIVE = format("%s(%s|%s)%s", START, dateWithDashes, phoneWithSpace, END);
    }

    /**
     * A list of regular expressions representing words found in an OTP context (non case sensitive)
     * Note: TAN is short for Transaction Authentication Number
     */
    private static final String[] ENGLISH_CONTEXT_WORDS =
        new String[] {
          "pin",
          "pass[-\\s]?(code|word)",
          "TAN",
          "otp",
          "2fa",
          "(two|2)[-\\s]?factor",
          "log[-\\s]?in",
          "auth(enticat(e|ion))?",
          "code",
          "secret",
          "verif(y|ication)",
          "one(\\s|-)?time",
          "access",
          "validat(e|ion)"
        };

    private RegExStrings() {}
  }
}