@@ -67,6 +67,15 @@ def _normalize_bounds(string, pos, endpos):
def _is_bytes_like(object):
    """Return whether *object* is a bytes, bytearray, memoryview, array or mmap instance."""
    binary_types = (bytes, bytearray, memoryview, array, mmap)
    return isinstance(object, binary_types)
6969
def _getlocale():
    """Return the current locale as a single string.

    The result is ``'C'`` when ``locale.getlocale()`` reports neither a
    language nor an encoding. Otherwise it is ``'<lang>.<encoding>'``, with
    ``'en_US'`` substituted for a missing language. If only the encoding is
    missing, the language alone is returned.
    """
    from locale import getlocale
    lang, encoding = getlocale()
    # Compare against `encoding`, the name actually bound above; the previous
    # code referenced an undefined name here and raised NameError.
    if lang is None and encoding is None:
        return 'C'
    if lang is None:
        lang = 'en_US'
    if encoding is None:
        # str.join would raise TypeError on a None element.
        return lang
    return '.'.join((lang, encoding))
78+
7079def _new_compile (p , flags = 0 ):
7180 if _with_tregex and isinstance (p , (str , bytes , bytearray , memoryview , array , mmap )):
7281 return _t_compile (p , flags )
@@ -237,6 +246,7 @@ def __init__(self, pattern, flags):
237246 self .__binary = _is_bytes_like (pattern )
238247 self .pattern = pattern
239248 self .__input_flags = flags
249+ self .__locale_sensitive = self .__is_locale_sensitive (pattern , flags )
240250 flags_str = []
241251 for char , flag in FLAGS .items ():
242252 if flags & flag :
@@ -290,11 +300,19 @@ def __check_input_type(self, input):
290300 raise TypeError ("cannot use a bytes pattern on a string-like object" )
291301
292302 def __tregex_compile (self , method = "search" , must_advance = False ):
293- if (method , must_advance ) not in self .__compiled_regexes :
303+ if self .__locale_sensitive :
304+ key = (method , must_advance , _getlocale ())
305+ else :
306+ key = (method , must_advance )
307+ if key not in self .__compiled_regexes :
294308 try :
295- extra_options = f"PythonMethod={ method } ,MustAdvance={ 'true' if must_advance else 'false' } "
309+ if self .__locale_sensitive :
310+ locale_option = ",PythonLocale=" + key [2 ]
311+ else :
312+ locale_option = ""
313+ extra_options = f"PythonMethod={ method } ,MustAdvance={ 'true' if must_advance else 'false' } { locale_option } "
296314 compiled_regex = tregex_compile_internal (self .pattern , self .__flags_str , extra_options )
297- self .__compiled_regexes [( method , must_advance ) ] = compiled_regex
315+ self .__compiled_regexes [key ] = compiled_regex
298316 except ValueError as e :
299317 if len (e .args ) == 2 :
300318 msg = e .args [0 ]
@@ -307,7 +325,35 @@ def __tregex_compile(self, method="search", must_advance=False):
307325 raise ValueError (msg ) from None
308326 raise error (msg , self .pattern , e .args [1 ]) from None
309327 raise
310- return self .__compiled_regexes [(method , must_advance )]
328+ return self .__compiled_regexes [key ]
329+
def __is_locale_sensitive(self, pattern, flags):
    """Tell whether the regex may depend on the current locale.

    Only patterns accepted by ``_is_bytes_like`` can be locale-sensitive. For
    those, the result is ``True`` when ``FLAG_LOCALE`` is set in *flags*, or
    when an unescaped ``(?`` is followed by a run of inline-flag letters
    (``aiLmsux``) that contains ``L``.

    The check is not completely precise: it can return ``True`` even though
    the regex is *not* locale-sensitive, namely when a sequence resembling
    inline flags appears inside a character class or a comment.
    """
    if not _is_bytes_like(pattern):
        return False
    if flags & FLAG_LOCALE != 0:
        return True
    # str(buffer, encoding) decodes any buffer-protocol object. Calling
    # .decode() directly only works for bytes and bytearray and would raise
    # AttributeError for memoryview, array and mmap patterns.
    pattern = str(pattern, 'latin-1')
    length = len(pattern)
    position = 0
    while position < length:
        position = pattern.find('(?', position)
        if position == -1:
            break
        # Walk left over the run of backslashes directly preceding '('.
        backslash_position = position - 1
        while backslash_position >= 0 and pattern[backslash_position] == '\\':
            backslash_position -= 1
        # jump over '(?'
        position += 2
        if (position - backslash_position) % 2 == 0:
            # odd number of backslashes: the parenthesis is a literal
            continue
        # Scan the inline-flag letters that directly follow '(?'.
        while position < length and pattern[position] in 'aiLmsux':
            if pattern[position] == 'L':
                return True
            position += 1
    return False
311357
312358 def __fallback_compile (self ):
313359 if self .__compiled_fallback is None :
0 commit comments