% \iffalse meta-comment % %% File: l3regex.dtx Copyright (C) 2011-2014 The LaTeX3 Project %% %% It may be distributed and/or modified under the conditions of the %% LaTeX Project Public License (LPPL), either version 1.3c of this %% license or (at your option) any later version. The latest version %% of this license is in the file %% %% http://www.latex-project.org/lppl.txt %% %% This file is part of the "l3experimental bundle" (The Work in LPPL) %% and all files in that bundle must be distributed together. %% %% The released version of this bundle is available from CTAN. %% %% ----------------------------------------------------------------------- %% %% The development version of the bundle can be found at %% %% http://www.latex-project.org/svnroot/experimental/trunk/ %% %% for those people who are interested. %% %%%%%%%%%%% %% NOTE: %% %%%%%%%%%%% %% %% Snapshots taken from the repository represent work in progress and may %% not work or may contain conflicting material! We therefore ask %% people _not_ to put them into distributions, archives, etc. without %% prior consultation with the LaTeX3 Project. %% %% ----------------------------------------------------------------------- % %<*driver|package> % The version of expl3 required is tested as early as possible, as % some really old versions do not define \ProvidesExplPackage. \RequirePackage{expl3}[2014/11/25] %\@ifpackagelater{expl3}{2014/11/25} % {} % {% % \PackageError{l3regex}{Support package l3kernel too old} % {% % Please install an up to date version of l3kernel\MessageBreak % using your TeX package manager or from CTAN.\MessageBreak % \MessageBreak % Loading l3regex will abort!% % }% % \endinput % } \GetIdInfo$Id: l3regex.dtx 5471 2014-11-25 20:11:29Z joseph $ {L3 Experimental regular expressions} % %<*driver> \documentclass[full]{l3doc} \usepackage{amsmath} \begin{document} \DocInput{\jobname.dtx} \end{document} % % \fi % % \title{^^A % The \textsf{l3regex} package: regular expressions in \TeX{}^^A % \thanks{This file describes v\ExplFileVersion, % last revised \ExplFileDate.}^^A % } % % \author{^^A % The \LaTeX3 Project\thanks % {^^A % E-mail: % \href{mailto:latex-team@latex-project.org} % {latex-team@latex-project.org}^^A % }^^A % } % % \date{Released \ExplFileDate} % % \maketitle % % \begin{documentation} % \newenvironment{l3regex-syntax} % {\begin{itemize}\def\\{\char`\\}\def\makelabel##1{\hss\llap{\ttfamily##1}}} % {\end{itemize}} % % \section{\pkg{l3regex} documentation} % % The \pkg{l3regex} package provides regular expression testing, % extraction of submatches, splitting, and replacement, all acting % on token lists. The syntax of regular expressions is mostly a subset % of the \textsc{pcre} syntax (and very close to \textsc{posix}), % with some additions % due to the fact that \TeX{} manipulates tokens rather than characters. % For performance reasons, only a limited set of features are implemented. % Notably, back-references are not supported. % % Let us give a few examples. After % \begin{verbatim} % \tl_set:Nn \l_my_tl { That~cat. } % \regex_replace_once:nnN { at } { is } \l_my_tl % \end{verbatim} % the token list variable \cs{l_my_tl} holds the text % \enquote{\texttt{This cat.}}, where the first % occurrence of \enquote{\texttt{at}} was replaced % by \enquote{\texttt{is}}. A more complicated example is % a pattern to add a comma at the end of each word: % \begin{verbatim} % \regex_replace_all:nnN { \w+ } { \0 , } \l_my_tl % \end{verbatim} % The |\w| sequence represents any \enquote{word} character, % and |+| indicates that the |\w| sequence should be repeated % as many times as possible (at least once), hence matching a word in the % input token list. In the replacement text, |\0| denotes the full match % (here, a word). % % If a regular expression is to be used several times, % it can be compiled once, and stored in a regex % variable using \cs{regex_const:Nn}. For example, % \begin{verbatim} % \regex_const:Nn \c_foo_regex { \c{begin} \cB. (\c[^BE].*) \cE. } % \end{verbatim} % stores in \cs{c_foo_regex} a regular expression which matches the % starting marker for an environment: \cs{begin}, followed by a % begin-group token (|\cB.|), then any number of tokens which are % neither begin-group nor end-group character tokens (|\c[^BE].*|), % ending with an end-group token (|\cE.|). As explained in the next % section, the parentheses \enquote{capture} the result of |\c[^BE].*|, % giving us access to the name of the environment when doing % replacements. % % \subsection{Syntax of regular expressions} % % Most characters match exactly themselves, % with an arbitrary category code. Some characters are % special and must be escaped with a backslash (\emph{e.g.}, |\*| % matches a star character). Some escape sequences of % the form backslash--letter also have a special meaning % (for instance |\d| matches any digit). As a rule, % \begin{itemize} % \item every alphanumeric character (\texttt{A}--\texttt{Z}, % \texttt{a}--\texttt{z}, \texttt{0}--\texttt{9}) matches % exactly itself, and should not be escaped, because % |\A|, |\B|, \ldots{} have special meanings; % \item non-alphanumeric printable ascii characters can (and should) % always be escaped: many of them have special meanings (\emph{e.g.}, % use |\(|, |\)|, |\?|, |\.|); % \item spaces should always be escaped (even in character % classes); % \item any other character may be escaped or not, without any % effect: both versions will match exactly that character. % \end{itemize} % Note that these rules play nicely with the fact that many % non-alphanumeric characters are difficult to input into \TeX{} % under normal category codes. For instance, |\\abc\%| % matches the characters |\abc%| (with arbitrary category codes), % but does not match the control sequence |\abc| followed by a % percent character. Matching control sequences can be done % using the |\c|\Arg{regex} syntax (see below). % % Any special character which appears at a place where its special % behaviour cannot apply matches itself instead (for instance, a % quantifier appearing at the beginning of a string), after raising a % warning. % % Characters. % \begin{l3regex-syntax} % \item[\\x\{hh\ldots{}\}] Character with hex code \texttt{hh\ldots{}} % \item[\\xhh] Character with hex code \texttt{hh}. % \item[\\a] Alarm (hex 07). % \item[\\e] Escape (hex 1B). % \item[\\f] Form-feed (hex 0C). % \item[\\n] New line (hex 0A). % \item[\\r] Carriage return (hex 0D). % \item[\\t] Horizontal tab (hex 09). % \end{l3regex-syntax} % % Character types. % \begin{l3regex-syntax} % \item[.] A single period matches any token. % \item[\\d] Any decimal digit. % \item[\\h] Any horizontal space character, % equivalent to |[\ \^^I]|: space and tab. % \item[\\s] Any space character, % equivalent to |[\ \^^I\^^J\^^L\^^M]|. % \item[\\v] Any vertical space character, % equivalent to |[\^^J\^^K\^^L\^^M]|. Note that |\^^K| is a vertical space, % but not a space, for compatibility with Perl. % \item[\\w] Any word character, \emph{i.e.}, % alpha-numerics and underscore, equivalent to |[A-Za-z0-9\_]|. % \item[\\D] Any token not matched by |\d|. % \item[\\H] Any token not matched by |\h|. % \item[\\N] Any token other than the |\n| character (hex 0A). % \item[\\S] Any token not matched by |\s|. % \item[\\V] Any token not matched by |\v|. % \item[\\W] Any token not matched by |\w|. % \end{l3regex-syntax} % Of those, |.|, |\D|, |\H|, |\N|, |\S|, |\V|, and |\W| will match arbitrary % control sequences. % % Character classes match exactly one token in the subject. % \begin{l3regex-syntax} % \item[{[\ldots{}]}] Positive character class. % Matches any of the specified tokens. % \item[{[\char`\^\ldots{}]}] Negative character class. % Matches any token other than the specified characters. % \item[{x-y}] Within a character class, this denotes a range (can be % used with escaped characters). % \item[{[:\meta{name}:]}] Within a character class (one more set of % brackets), this denotes the \textsc{posix} character class % \meta{name}, which can be \texttt{alnum}, \texttt{alpha}, % \texttt{ascii}, \texttt{blank}, \texttt{cntrl}, \texttt{digit}, % \texttt{graph}, \texttt{lower}, \texttt{print}, \texttt{punct}, % \texttt{space}, \texttt{upper}, \texttt{word}, or \texttt{xdigit}. % \item[{[:\char`\^\meta{name}:]}] Negative \textsc{posix} character class. % \end{l3regex-syntax} % For instance, |[a-oq-z\cC.]| matches any lowercase latin letter % except |p|, as well as control sequences (see below for a description % of |\c|). % % Quantifiers (repetition). % \begin{l3regex-syntax} % \item[?] $0$ or $1$, greedy. % \item[??] $0$ or $1$, lazy. % \item[*] $0$ or more, greedy. % \item[*?] $0$ or more, lazy. % \item[+] $1$ or more, greedy. % \item[+?] $1$ or more, lazy. % \item[\{$n$\}] Exactly $n$. % \item[\{$n,$\}] $n$ or more, greedy. % \item[\{$n,$\}?] $n$ or more, lazy. % \item[\{$n,m$\}] At least $n$, no more than $m$, greedy. % \item[\{$n,m$\}?] At least $n$, no more than $m$, lazy. % \end{l3regex-syntax} % % Anchors and simple assertions. % \begin{l3regex-syntax} % \item[\\b] Word boundary: either the previous token is matched by % |\w| and the next by |\W|, or the opposite. For this purpose, % the ends of the token list are considered as |\W|. % \item[\\B] Not a word boundary: between two |\w| tokens % or two |\W| tokens (including the boundary). % \item[\char`^ \textrm{or} \\A] % Start of the subject token list. % \item[\char`$\textrm{,} \\Z \textrm{or} \\z] % End of the subject token list. % \item[\\G] Start of the current match. This is only different from |^| % in the case of multiple matches: for instance % |\regex_count:nnN { \G a } { aaba } \l_tmpa_int| yields $2$, but % replacing |\G| by |^| would result in \cs{l_tmpa_int} holding the % value $1$. % \end{l3regex-syntax} % % Alternation and capturing groups. % \begin{l3regex-syntax} % \item[A\char`|B\char`|C] Either one of \texttt{A}, \texttt{B}, % or \texttt{C}. % \item[(\ldots{})] Capturing group. % \item[(?:\ldots{})] Non-capturing group. % \item[(?\char`|\ldots{})] Non-capturing group which resets % the group number for capturing groups in each alternative. % The following group will be numbered with the first unused % group number. % \end{l3regex-syntax} % % The |\c| escape sequence allows to test the category code of tokens, % and match control sequences. Each character category is represented % by a single uppercase letter: % \begin{itemize} % \item |C| for control sequences; % \item |B| for begin-group tokens; % \item |E| for end-group tokens; % \item |M| for math shift; % \item |T| for alignment tab tokens; % \item |P| for macro parameter tokens; % \item |U| for superscript tokens (up); % \item |D| for subscript tokens (down); % \item |S| for spaces; % \item |L| for letters; % \item |O| for others; and % \item |A| for active characters. % \end{itemize} % The |\c| escape sequence is used as follows. % \begin{l3regex-syntax} % \item[\\c\Arg{regex}] A control sequence whose csname matches the % \meta{regex}, anchored at the beginning and end, so that |\c{begin}| % matches exactly \cs{begin}, and nothing else. % \item[\\cX] Applies to the next object, which can be a character, % character property, class, or group, and forces this object to % only match tokens with category |X| (any of |CBEMTPUDSLOA|. For % instance, |\cL[A-Z\d]| matches uppercase letters and digits of % category code letter, |\cC.| matches any control sequence, and % |\cO(abc)| matches |abc| where each character has category other. % \item[{\\c[XYZ]}] Applies to the next object, and forces it to only % match tokens with category |X|, |Y|, or |Z| (each being any of % |CBEMTPUDSLOA|). For instance, |\c[LSO](..)| matches two tokens of % category letter, space, or other. % \item[{\\c[\char`\^XYZ]}] Applies to the next object and prevents it % from matching any token with category |X|, |Y|, or |Z| (each being % any of |CBEMTPUDSLOA|). For instance, |\c[^O]\d| matches digits % which have any category different from other. % \end{l3regex-syntax} % The category code tests can be used inside classes; for instance, % |[\cO\d \c[LO][A-F]]| matches what \TeX{} considers as hexadecimal % digits, namely digits with category other, or uppercase letters from % |A| to |F| with category either letter or other. Within a group % affected by a category code test, the outer test can be overridden by % a nested test: for instance, |\cL(ab\cO\*cd)| matches |ab*cd| where % all characters are of category letter, except |*| which has category % other. % % The |\u| escape sequence allows to insert the contents of a token list % directly into a regular expression or a replacement, avoiding the need % to escape special characters. Namely, |\u|\Arg{tl~var~name} matches % the exact contents of the token list \meta{tl~var}. Within a |\c{...}| % control sequence matching, the |\u| escape sequence only expands its % argument once, in effect performing \cs{tl_to_str:v}. Quantifiers are % not supported directly: use a group. % % The option |(?i)| makes the match case insensitive (identifying % \texttt{A}--\texttt{Z} with \texttt{a}--\texttt{z}; no Unicode support % yet). This applies until the end of the group in which it appears, and % can be reverted using |(?-i)|. For instance, in % \verb"(?i)(a(?-i)b|c)d", the letters |a| and |d| are affected by the % |i| option. Characters within ranges and classes are affected % individually: |(?i)[Y-\\]| is equivalent to |[YZ\[\\yz]|, and % |(?i)[^aeiou]| matches any character which is not a vowel. Neither % character properties, nor |\c{...}| nor |\u{...}| are affected by the % |i| option. % ^^A \] % % In character classes, only |[|, |^|, |-|, |]|, |\| and spaces are % special, and should be escaped. Other non-alphanumeric characters can % still be escaped without harm. Any escape sequence which matches a % single character (|\d|, |\D|, \emph{etc.}) is supported in character % classes. If the first character is |^|, then % the meaning of the character class is inverted; |^| appearing anywhere % else in the range is not special. If the first character (possibly % following a leading |^|) is |]| then it does not need to be escaped % since ending the range there would make it empty. % Ranges of characters % can be expressed using |-|, for instance, |[\D 0-5]| and |[^6-9]| are % equivalent. % % Capturing groups are a means of extracting information about the % match. Parenthesized groups are labelled in the order of their % opening parenthesis, starting at $1$. The contents of those groups % corresponding to the \enquote{best} match (leftmost longest) % can be extracted and stored in a sequence of token lists using for % instance \cs{regex_extract_once:nnNTF}. % % The |\K| escape sequence resets the beginning of the match to the % current position in the token list. This only affects what is reported % as the full match. For instance, % \begin{verbatim} % \regex_extract_all:nnN { a \K . } { a123aaxyz } \l_foo_seq % \end{verbatim} % results in \cs{l_foo_seq} containing the items |{1}| and |{a}|: the % true matches are |{a1}| and |{aa}|, but they are trimmed by the use of % |\K|. The |\K| command does not affect capturing groups: for instance, % \begin{verbatim} % \regex_extract_once:nnN { (. \K c)+ \d } { acbc3 } \l_foo_seq % \end{verbatim} % results in \cs{l_foo_seq} containing the items |{c3}| and |{bc}|: the % true match is |{acbc3}|, with first submatch |{bc}|, but |\K| resets % the beginning of the match to the last position where it appears. % % \subsection{Syntax of the replacement text} % % Most of the features described in regular expressions do not make % sense within the replacement text. Escaped characters are supported % as inside regular expressions. The whole match is accessed as~|\0|, % and the first~$9$ submatches are accessed as |\1|, \ldots{},~|\9|. % Further submatches are accessed through |\g{|\meta{number}|}| where % \meta{number} is any non-negative integer. If there are fewer than % \meta{number} capturing groups, the submatch is empty. % % For instance, % \begin{verbatim} % \tl_set:Nn \l_my_tl { Hello,~world! } % \regex_replace_all:nnN { ([er]?l|o) . } { \(\0\-\-\1\) } \l_my_tl % \end{verbatim} % results in \cs{l_my_tl} holding |H(ell--el)(o,--o) w(or--o)(ld--l)!| % % Submatches keep the same category codes as in the original token list. % The characters inserted by the replacement have category code $12$ % (other) by default, with the exception of space characters. Spaces % inserted through \verb*|\ | have category code $10$, while spaces % inserted through |\x20| or |\x{20}| have category code $12$. % The escape sequence |\c| allows to insert characters % with arbitrary category codes, as well as control sequences. % \begin{l3regex-syntax} % \item[\\cXY] Produces the character~|Y| (which can be given as an % escape sequence such as~|\t| for tab, or |\(| or~|\)| for a % parenthesis) with category code~|X|, which must be one of % |CBEMTPUDSLOA|. % \item[\\c\Arg{text}] Produces the control sequence with csname % \meta{text}. The \meta{text} may contain references to the % submatches |\0|, |\1|, \emph{etc.} % \end{l3regex-syntax} % % The escape sequence |\u|\Arg{tl~var~name} allows to insert the % contents of the token list with name \meta{tl~var~name} directly into % the replacement, avoiding the need to escape special characters. % Within the construction |\c|\Arg{text}, the |\u|~escape sequence only % expands its argument once, in effect performing \cs{tl_to_str:v}. % Submatches can be used within the argument of |\u|. For instance, % \begin{verbatim} % \tl_set:Nn \l_my_one_tl { first } % \tl_set:Nn \l_my_two_tl { \emph{second} } % \tl_set:Nn \l_my_tl { one , two , one , one } % \regex_replace_all:nnN { [^,]+ } { \u{l_my_\0_tl} } \l_my_tl % \end{verbatim} % results in \cs{l_my_tl} holding |first,\emph{second},first,first|. % % \subsection{Pre-compiling regular expressions} % % If a regular expression is to be used several times, % it is better to compile it once rather than doing it % each time the regular expression is used. The compiled % regular expression is stored in a variable. All % of the \pkg{l3regex} module's functions can be given their % regular expression argument either as an explicit string % or as a compiled regular expression. % % \begin{function}{\regex_new:N} % \begin{syntax} % \cs{regex_new:N} \meta{regex~var} % \end{syntax} % Creates a new \meta{regex~var} or raises an error if the % name is already taken. The declaration is global. The % \meta{regex~var} will initially be such that it never matches. % \end{function} % % \begin{function}{\regex_set:Nn, \regex_gset:Nn, \regex_const:Nn} % \begin{syntax} % \cs{regex_set:Nn} \meta{regex~var} \Arg{regex} % \end{syntax} % Stores a compiled version of the \meta{regular expression} % in the \meta{regex~var}. For instance, this function can be used % as % \begin{verbatim} % \regex_new:N \l_my_regex % \regex_set:Nn \l_my_regex { my\ (simple\ )? reg(ex|ular\ expression) } % \end{verbatim} % The assignment is local for \cs{regex_set:Nn} and global for % \cs{regex_gset:Nn}. Use \cs{regex_const:Nn} for compiled expressions % which will never change. % \end{function} % % \begin{function}{\regex_show:n, \regex_show:N} % \begin{syntax} % \cs{regex_show:n} \Arg{regex} % \end{syntax} % Shows how \pkg{l3regex} interprets the \meta{regex}. For instance, % \cs{regex_show:n} \verb+{\A X|Y}+ shows % \begin{verbatim} % +-branch % anchor at start (\A) % char code 88 % +-branch % char code 89 % \end{verbatim} % indicating that the anchor |\A| only applies to the first branch: % the second branch is not anchored to the beginning of the match. % \end{function} % % \subsection{Matching} % % All regular expression functions are available in both |:n| and |:N| % variants. The former require a \enquote{standard} regular expression, % while the later require a compiled expression as generated by % \cs{regex_(g)set:Nn}. % % \begin{function}[TF]{\regex_match:nn, \regex_match:Nn} % \begin{syntax} % \cs{regex_match:nnTF} \Arg{regex} \Arg{token list} \Arg{true code} \Arg{false code} % \end{syntax} % Tests whether the \meta{regular expression} matches any part % of the \meta{token list}. For instance, % \begin{verbatim} % \regex_match:nnTF { b [cde]* } { abecdcx } { TRUE } { FALSE } % \regex_match:nnTF { [b-dq-w] } { example } { TRUE } { FALSE } % \end{verbatim} % leaves \texttt{TRUE} then \texttt{FALSE} in the input stream. % \end{function} % % \begin{function}{\regex_count:nnN, \regex_count:NnN} % \begin{syntax} % \cs{regex_count:nnN} \Arg{regex} \Arg{token list} \meta{int var} % \end{syntax} % Sets \meta{int var} within the current \TeX{} group level % equal to the number of times % \meta{regular expression} appears in \meta{token list}. % The search starts by finding the left-most longest match, % respecting greedy and ungreedy operators. Then the search % starts again from the character following the last character % of the previous match, until reaching the end of the token list. % Infinite loops are prevented in the case where the regular expression % can match an empty token list: then we count one match between each % pair of characters. % For instance, % \begin{verbatim} % \int_new:N \l_foo_int % \regex_count:nnN { (b+|c) } { abbababcbb } \l_foo_int % \end{verbatim} % results in \cs{l_foo_int} taking the value $5$. % \end{function} % % \subsection{Submatch extraction} % % \begin{function}[TF]{\regex_extract_once:nnN, \regex_extract_once:NnN} % \begin{syntax} % \cs{regex_extract_once:nnN} \Arg{regex} \Arg{token list} \meta{seq~var} % \cs{regex_extract_once:nnNTF} \Arg{regex} \Arg{token list} \meta{seq~var} \Arg{true code} \Arg{false code} % \end{syntax} % Finds the first match of the \meta{regular expression} % in the \meta{token list}. If it exists, the match is stored % as the zeroeth item of the \meta{seq~var}, and further % items are the contents of capturing groups, in the order % of their opening parenthesis. The \meta{seq~var} % is assigned locally. If there is no match, % the \meta{seq~var} is cleared. % The testing versions insert the \meta{true code} into the input % stream if a match was found, and the \meta{false code} otherwise. % For instance, assume that you type % \begin{verbatim} % \regex_extract_once:nnNTF { \A(La)?TeX(!*)\Z } { LaTeX!!! } \l_foo_seq % { true } { false } % \end{verbatim} % Then the regular expression (anchored at the start with |\A| and % at the end with |\Z|) will match the whole token list. The first % capturing group, |(La)?|, matches |La|, and the second capturing % group, |(!*)|, matches |!!!|. Thus, |\l_foo_seq| will contain % the items |{LaTeX!!!}|, |{La}|, and |{!!!}|, and the \texttt{true} % branch is left in the input stream. % \end{function} % % \begin{function}[TF]{\regex_extract_all:nnN, \regex_extract_all:NnN} % \begin{syntax} % \cs{regex_extract_all:nnN} \Arg{regex} \Arg{token list} \meta{seq~var} % \cs{regex_extract_all:nnNTF} \Arg{regex} \Arg{token list} \meta{seq~var} \Arg{true code} \Arg{false code} % \end{syntax} % Finds all matches of the \meta{regular expression} % in the \meta{token list}, and stores all the submatch information % in a single sequence (concatenating the results of % multiple \cs{regex_extract_once:nnN} calls). % The \meta{seq~var} is assigned locally. If there is no match, % the \meta{seq~var} is cleared. % The testing versions insert the \meta{true code} into the input % stream if a match was found, and the \meta{false code} otherwise. % For instance, assume that you type % \begin{verbatim} % \regex_extract_all:nnNTF { \w+ } { Hello,~world! } \l_foo_seq % { true } { false } % \end{verbatim} % Then the regular expression will match twice, and the resulting % sequence contains the two items |{Hello}| and |{world}|, % and the \texttt{true} branch is left in the input stream. % \end{function} % % \begin{function}[TF]{\regex_split:nnN, \regex_split:NnN} % \begin{syntax} % \cs{regex_split:nnN} \Arg{regular expression} \Arg{token list} \meta{seq~var} % \cs{regex_split:nnNTF} \Arg{regular expression} \Arg{token list} \meta{seq~var} \Arg{true code} \Arg{false code} % \end{syntax} % Splits the \meta{token list} into a sequence of parts, delimited by % matches of the \meta{regular expression}. If the \meta{regular expression} % has capturing groups, then the token lists that they match are stored as % items of the sequence as well. The assignment to \meta{seq~var} is local. % If no match is found the resulting \meta{seq~var} has the % \meta{token list} as its sole item. If the \meta{regular expression} % matches the empty token list, then the \meta{token list} is split % into single tokens. % The testing versions insert the \meta{true code} into the input % stream if a match was found, and the \meta{false code} otherwise. % For example, after % \begin{verbatim} % \seq_new:N \l_path_seq % \regex_split:nnNTF { / } { the/path/for/this/file.tex } \l_path_seq % { true } { false } % \end{verbatim} % the sequence |\l_path_seq| contains the items |{the}|, |{path}|, % |{for}|, |{this}|, and |{file.tex}|, and the \texttt{true} branch % is left in the input stream. % \end{function} % % \subsection{Replacement} % % \begin{function}[TF]{\regex_replace_once:nnN,\regex_replace_once:NnN} % \begin{syntax} % \cs{regex_replace_once:nnN} \Arg{regular expression} \Arg{replacement} \meta{tl~var} % \cs{regex_replace_once:nnNTF} \Arg{regular expression} \Arg{replacement} \meta{tl~var} \Arg{true code} \Arg{false code} % \end{syntax} % Searches for the \meta{regular expression} in the \meta{token list} % and replaces the first match with the \meta{replacement}. The result % is assigned locally to \meta{tl~var}. In the \meta{replacement}, % |\0| represents the full match, |\1| represent the contents of the % first capturing group, |\2| of the second, \emph{etc.} % \end{function} % % \begin{function}[TF]{\regex_replace_all:nnN, \regex_replace_all:NnN} % \begin{syntax} % \cs{regex_replace_all:nnN} \Arg{regular expression} \Arg{replacement} \meta{tl~var} % \cs{regex_replace_all:nnNTF} \Arg{regular expression} \Arg{replacement} \meta{tl~var} \Arg{true code} \Arg{false code} % \end{syntax} % Replaces all occurrences of the \cs{regular expression} in the % \meta{token list} by the \meta{replacement}, where |\0| represents % the full match, |\1| represent the contents of the first capturing % group, |\2| of the second, \emph{etc.} Every match is treated % independently, and matches cannot overlap. The result is assigned % locally to \meta{tl~var}. % \end{function} % % \subsection{Bugs, misfeatures, future work, and other possibilities} % % The following need to be done now. % \begin{itemize} % \item Change user function names! % \item Clean up the use of messages. % \item Rewrite the documentation in a more ordered way, perhaps add a % \textsc{bnf}? % \end{itemize} % % Additional error-checking to come. % \begin{itemize} % \item Currently, |a{\x34}| is recognized as |a{4}|. % \item Cleaner error reporting in the replacement phase. % \item Add tracing information. % \item Detect attempts to use back-references. % \item Test for the maximum register \cs{c_max_register_int}. % \item Find out whether the fact that |\W| and friends match the % end-marker leads to bugs. Possibly update \cs{__regex_item_reverse:n}. % \item Enforce that |\cC| can only be followed by a match-all dot. % \item The empty cs should be matched by |\c{}|, not by % |\c{csname.?endcsname\s?}|. % \end{itemize} % % Code improvements to come. % \begin{itemize} % \item Change \tn{skip} to \tn{dimen} for the array of active % threads, and shift the array of submatch informations so that it % starts at \tn{skip}$0$. % \item Optimize |\c{abc}| for matching a specific control sequence. % \item Only build \c{...} once. % \item Use \tn{skip} for the left and right state stacks when % compiling a regex. % \item Should \cs{__regex_action_free_group:n} only be used for greedy % |{n,}| quantifier? (I think not.) % \item Quantifiers for |\u| and assertions. % \item Improve digit grabbing for the |\g| escape in replacement. % Allow arbitrary integer expressions for all those numbers? % \item When matching, keep track of an explicit stack of % \texttt{current_state} and \texttt{current_submatches}. % \item If possible, when a state is reused by the same thread, kill % other subthreads. % \item Use \tn{dimen} registers rather than \cs{l__regex_balance_tl} % to build \cs{__regex_replacement_balance_one_match:n}. % \item Reduce the number of epsilon-transitions in alternatives. % \item Optimize simple strings: use less states (|abcade| should give % two states, for |abc| and |ade|). [Does that really make sense?] % \item Optimize groups with no alternative. % \item Optimize states with a single \cs{__regex_action_free:n}. % \item Optimize the use of \cs{__regex_action_success:} by inserting it % in state $2$ directly instead of having an extra transition. % \item Optimize the use of \cs{int_step_...} functions. % \item Groups don't capture within regexes for csnames; optimize and % document. % \item Decide and document what |\c{\c{...}}| should do in the % replacement text, similar questions for |\u|. % \item Better \enquote{show} for anchors, properties, and catcode tests. % \item Does |\K| really need a new state for itself? % \item When compiling, use a boolean \texttt{in_cs} and less magic % numbers. % \item Instead of checking whether the character is special or % alphanumeric using its character code, check if it is special in % regexes with \cs{cs_if_exist} tests. % \end{itemize} % % The following features are likely to be implemented at some point % in the future. % \begin{itemize} % \item Allow |\cL(abc)| in replacement text. % \item General look-ahead/behind assertions. % \item Regex matching on external files. % \item Conditional subpatterns with look ahead/behind: \enquote{if % what follows is [\ldots{}], then [\ldots{}]}. % \item |(*..)| and |(?..)| sequences to set some options. % \item UTF-8 mode for pdf\TeX{}. % \item Newline conventions are not done. % In particular, we should have an option for |.| not to match newlines. % Also, |\A| should differ from |^|, and |\Z|, |\z| and |$| should % differ. % \item Unicode properties: |\p{..}| and |\P{..}|; % |\X| which should match any \enquote{extended} Unicode sequence. % This requires to manipulate a lot of data, probably using tree-boxes. % \end{itemize} % % The following features of \textsc{pcre} or Perl will probably not be % implemented. % \begin{itemize} % \item |\ddd|, matching the character with octal code \texttt{ddd}; % \item Callout with |(?C...)|, we cannot run arbitrary user code % during the matching, because the regex code uses registers in an % unsafe way; % \item Conditional subpatterns (other than with a look-ahead or % look-behind condition): this is non-regular, isn't it? % \item Named subpatterns: \TeX{} programmers have lived so far % without any need for named macro parameters. % \end{itemize} % % The following features of \textsc{pcre} or Perl will definitely not be % implemented. % \begin{itemize} % \item |\cx|, similar to \TeX{}'s own |\^^x|; % \item Comments: \TeX{} already has its own system for comments. % \item |\Q...\E| escaping: this would require to read the argument % verbatim, which is not in the scope of this module. % \item Atomic grouping, possessive quantifiers: those tools, mostly % meant to fix catastrophic backtracking, are unnecessary in a % non-backtracking algorithm, and difficult to implement. % \item Subroutine calls: this syntactic sugar is difficult to include % in a non-backtracking algorithm, in particular because the % corresponding group should be treated as atomic. Also, we cannot % afford to run user code within the regular expression matching, % because of our \enquote{misuse} of registers. % \item Recursion: this is a non-regular feature. % \item Back-references: non-regular feature, this requires % backtracking, which is prohibitively slow. % \item Backtracking control verbs: intrinsically tied to % backtracking. % \item |\C| single byte in UTF-8 mode: Xe\TeX{} and Lua\TeX{} serve % us characters directly, and splitting those into bytes is tricky, % encoding dependent, and most likely not useful anyways. % \end{itemize} % % \end{documentation} % % \begin{implementation} % % \section{\pkg{l3regex} implementation} % % \begin{macrocode} %<*initex|package> % \end{macrocode} % % \begin{macrocode} %<@@=regex> % \end{macrocode} % % \begin{macrocode} %<*package> \ProvidesExplPackage {\ExplFileName}{\ExplFileDate}{\ExplFileVersion}{\ExplFileDescription} \RequirePackage{l3tl-build, l3tl-analysis, l3flag, l3str, l3str-convert} % % \end{macrocode} % % \subsection{Plan of attack} % % Most regex engines use backtracking. This allows to provide very % powerful features (back-references come to mind first), but it is % costly, and raises the problem of catastrophic backtracking. Since % \TeX{} is not first and foremost a programming language, complicated % code tends to run slowly, and we must use faster, albeit slightly more % restrictive, techniques, coming from automata theory. % % Given a regular expression of $n$ characters, we do the following: % \begin{itemize} % \item (Compiling.) Analyse the regex, finding invalid input, and % convert it to an internal representation. % \item (Building.) Convert the compiled regex to a non-deterministic % finite automaton (\textsc{nfa}) with roughly $n$ states which % accepts precisely token lists matching that regex. % \item (Matching.) Loop through the query token list one token (one % \enquote{position}) at a time, exploring in parallel every % possible path (\enquote{active thread}) through the \textsc{nfa}, % considering active threads in an order determined by the % quantifiers' greediness. % \end{itemize} % % We use the following vocabulary in the code comments (and in variable % names). % \begin{itemize} % \item \emph{Group}: index of the capturing group, $-1$ for % non-capturing groups. % \item \emph{Position}: each token in the query is labelled by an % integer \meta{position}, with $\texttt{min_pos} - 1 \leq % \meta{position} \leq \texttt{max_pos}$. The lowest and highest % positions correspond to imaginary begin and end markers (with % inaccessible category code and character code). % \item \emph{Query}: the token list to which we apply the regular % expression. % \item \emph{State}: each state of the \textsc{nfa} is labelled by an % integer \meta{state} with $\texttt{min_state} \leq \meta{state} < % \texttt{max_state}$. % \item \emph{Active thread}: state of the \textsc{nfa} that is reached % when reading the query token list for the matching. Those threads % are ordered according to the greediness of quantifiers. % \item \emph{Step}: used when matching, starts at $0$, incremented % every time a character is read, and is not reset when searching % for repeated matches. The integer \cs{l_@@_step_int} is a % unique id for all the steps of the matching algorithm. % \end{itemize} % % To achieve a good performance, we abuse \TeX{}'s registers in two % ways. We access registers directly by number rather than tying them % to control sequence using \cs{int_new:N} and other allocation % functions. And we store integers in \tn{dimen} registers in scaled % points (\texttt{sp}), using \TeX{}'s implicit conversion from % dimensions to integers in some contexts. Specifically, the registers % are used as follows. When compiling, \tn{toks} registers are used % under the hood by functions from the \pkg{l3tl-build} module. When % building, % \begin{itemize} % \item \tn{toks}\meta{state} holds the tests and actions to perform % in the \meta{state} of the \textsc{nfa}. % \item (Not implemented yet.) % \tn{skip}$i$ has the form \meta{group id} \texttt{plus} % \meta{left state} \texttt{minus} \meta{right state}. % \end{itemize} % When matching, % \begin{itemize} % \item \tn{dimen}\meta{state} is equal to the last \meta{step} in % which the \meta{state} was active. % \item (Currently, we use \tn{skip} instead of \tn{dimen}.) % \tn{dimen}\meta{thread}, with $\texttt{min_active} \leq % \meta{thread} < \texttt{max_active}$, is equal to the % \meta{state} in which the \meta{thread} currently is. The % \meta{threads} or ordered starting from the best to the least % preferred. % \item \tn{toks}\meta{thread} holds the submatch information for the % \meta{thread}, as the contents of a property list. % \item \tn{muskip}\meta{position} holds as its main and stretch % components the character and category code of the token at this % \meta{position} in the query. % \item \tn{toks}\meta{position} holds \meta{tokens} which \texttt{o}- % and \texttt{x}-expand to the \meta{position}-th token in the query. % \item \tn{skip} registers hold the value of end-points of all % submatches as would be extracted by the \cs{regex_extract} % functions. Since smaller \tn{skip} registers are used, the minimum % index is twice \texttt{max_state}, and the used registers go up to % \cs{l_@@_submatch_int}. They are organized in blocks of % \texttt{capturing_group}, each block corresponding to one match % with all its submatches stored in consecutive \tn{skip}s. % \end{itemize} % \tn{count} registers are not abused, which means that we can safely % use named integers in this module. Note that \tn{box} registers are % not abused either; maybe we could leverage those for some purpose. % % The code is structured as follows. Variables are introduced in the % relevant section. First we present some generic helper functions. Then % comes the code for compiling a regular expression, and for showing the % result of the compilation. The building phase converts a compiled % regex to \textsc{nfa} states, and the automaton is run by the code in % the following section. The only remaining brick is parsing the % replacement text and performing the replacement. We are then ready for % all the user functions. Finally, messages, and a little bit of tracing % code. % % \subsection{Helpers} % % \begin{macro}[aux]{\tl_to_str:V} % A variant we need for the |\u| escape in the replacement text. % \begin{macrocode} \cs_generate_variant:Nn \tl_to_str:n { V } % \end{macrocode} % \end{macro} % % \subsubsection{Constants and variables} % % \begin{macro}[aux]{\@@_tmp:w} % Temporary function used for various short-term purposes. % \begin{macrocode} \cs_new:Npn \@@_tmp:w { } % \end{macrocode} % \end{macro} % % \begin{variable} % { % \l_@@_internal_a_tl, \l_@@_internal_b_tl, % \l_@@_internal_a_int, \l_@@_internal_b_int, % \l_@@_internal_c_int, \l_@@_internal_bool, % \l_@@_internal_seq, \g_@@_internal_tl, % } % Temporary variables used for various purposes. % \begin{macrocode} \tl_new:N \l_@@_internal_a_tl \tl_new:N \l_@@_internal_b_tl \int_new:N \l_@@_internal_a_int \int_new:N \l_@@_internal_b_int \int_new:N \l_@@_internal_c_int \bool_new:N \l_@@_internal_bool \seq_new:N \l_@@_internal_seq \tl_new:N \g_@@_internal_tl % \end{macrocode} % \end{variable} % % \begin{variable}{\c_@@_no_match_regex} % This regular expression matches nothing, but is still a valid % regular expression. We could use a failing assertion, but I went for % an empty class. It is used as the initial value for regular % expressions declared using \cs{regex_new:N}. % \begin{macrocode} \tl_const:Nn \c_@@_no_match_regex { \@@_branch:n { \@@_class:NnnnN \c_true_bool { } { 1 } { 0 } \c_true_bool } } % \end{macrocode} % \end{variable} % % \begin{variable}{\l_@@_balance_int} % The first thing we do when matching is to go once through the query % token list and store the information for each token as \tn{muskip} % and \tn{toks} registers. During this phase, \cs{l_@@_balance_int} % counts the balance of begin-group and end-group character tokens % which appear before a given point in the token list, and we store it % as the shrink component of each \tn{muskip} register. This variable % is also used to keep track of the balance in the replacement text. % \begin{macrocode} \int_new:N \l_@@_balance_int % \end{macrocode} % \end{variable} % % \begin{variable}{\l_@@_cs_name_tl} % This variable is used in \cs{@@_item_cs:n} to store the csname of % the currently-tested token when the regex contains a sub-regex for % testing csnames. % \begin{macrocode} \tl_new:N \l_@@_cs_name_tl % \end{macrocode} % \end{variable} % % \subsubsection{Testing characters} % % \begin{macro}[int]{\@@_break_point:TF} % \begin{macro}[int]{\@@_break_true:w} % When testing whether a character of the query token list matches % a given character class in the regular expression, we often % have to test it against several ranges of characters, checking % if any one of those matches. This is done with a structure like % \begin{quote} % \meta{test1} \ldots{} \meta{test$\sb{n}$} \\ % \cs{@@_break_point:TF} \Arg{true code} \Arg{false code} % \end{quote} % If any of the tests succeeds, it calls \cs{@@_break_true:w}, % which cleans up and leaves \meta{true code} in the input stream. % Otherwise, \cs{@@_break_point:TF} leaves the \meta{false code} % in the input stream. % \begin{macrocode} \cs_new_protected:Npn \@@_break_true:w #1 \@@_break_point:TF #2 #3 {#2} \cs_new_protected:Npn \@@_break_point:TF #1 #2 { #2 } % \end{macrocode} % \end{macro} % \end{macro} % % \begin{macro}[int]{\@@_item_reverse:n} % This function makes showing regular expressions easier, and lets us % define |\D| in terms of |\d| for instance. There is a subtlety: the % end of the query is marked by $-2$, and will thus match |\D| and % other negated properties; this case is caught by another part of % the code. % \begin{macrocode} \cs_new_protected:Npn \@@_item_reverse:n #1 { #1 \@@_break_point:TF { } \@@_break_true:w } % \end{macrocode} % \end{macro} % % \begin{macro}[int] % {\@@_item_caseful_equal:n, \@@_item_caseful_range:nn} % Simple comparisons triggering \cs{@@_break_true:w} when true. % \begin{macrocode} \cs_new_protected:Npn \@@_item_caseful_equal:n #1 { \if_int_compare:w #1 = \l_@@_current_char_int \exp_after:wN \@@_break_true:w \fi: } \cs_new_protected:Npn \@@_item_caseful_range:nn #1 #2 { \reverse_if:N \if_int_compare:w #1 > \l_@@_current_char_int \reverse_if:N \if_int_compare:w #2 < \l_@@_current_char_int \exp_after:wN \exp_after:wN \exp_after:wN \@@_break_true:w \fi: \fi: } % \end{macrocode} % \end{macro} % % \begin{macro}[int] % {\@@_item_caseless_equal:n, \@@_item_caseless_range:nn} % For caseless matching, we perform the test both on the % \texttt{current_char} and on the \texttt{case_changed_char}. Before % doing the second set of tests, we make sure that % \texttt{case_changed_char} has been computed. % \begin{macrocode} \cs_new_protected:Npn \@@_item_caseless_equal:n #1 { \if_int_compare:w #1 = \l_@@_current_char_int \exp_after:wN \@@_break_true:w \fi: \if_int_compare:w \l_@@_case_changed_char_int = \c_max_int \@@_compute_case_changed_char: \fi: \if_int_compare:w #1 = \l_@@_case_changed_char_int \exp_after:wN \@@_break_true:w \fi: } \cs_new_protected:Npn \@@_item_caseless_range:nn #1 #2 { \reverse_if:N \if_int_compare:w #1 > \l_@@_current_char_int \reverse_if:N \if_int_compare:w #2 < \l_@@_current_char_int \exp_after:wN \exp_after:wN \exp_after:wN \@@_break_true:w \fi: \fi: \if_int_compare:w \l_@@_case_changed_char_int = \c_max_int \@@_compute_case_changed_char: \fi: \reverse_if:N \if_int_compare:w #1 > \l_@@_case_changed_char_int \reverse_if:N \if_int_compare:w #2 < \l_@@_case_changed_char_int \exp_after:wN \exp_after:wN \exp_after:wN \@@_break_true:w \fi: \fi: } % \end{macrocode} % \end{macro} % % \begin{macro}[int]{\@@_compute_case_changed_char:} % This function is called when \cs{l_@@_case_changed_char_int} has % not yet been computed (or rather, when it is set to the marker value % \cs{c_max_int}). If the current character code is in the range % $[65,90]$ (upper-case), then add $32$, making it lowercase. If it is % in the lower-case letter range $[97,122]$, subtract $32$. % \begin{macrocode} \cs_new_protected_nopar:Npn \@@_compute_case_changed_char: { \int_set_eq:NN \l_@@_case_changed_char_int \l_@@_current_char_int \if_int_compare:w \l_@@_current_char_int < \c_ninety_one \if_int_compare:w \l_@@_current_char_int < \c_sixty_five \else: \int_add:Nn \l_@@_case_changed_char_int { \c_thirty_two } \fi: \else: \if_int_compare:w \l_@@_current_char_int < \c_one_hundred_twenty_three \if_int_compare:w \l_@@_current_char_int < \c_ninety_seven \else: \int_sub:Nn \l_@@_case_changed_char_int { \c_thirty_two } \fi: \fi: \fi: } % \end{macrocode} % \end{macro} % % \begin{macro}[int, EXP]{\@@_item_equal:n, \@@_item_range:nn} % Those must always be defined to expand to a \texttt{caseful} % (default) or \texttt{caseless} version, and not be protected: they % must expand when compiling, to hard-code which tests are caseless or % caseful. % \begin{macrocode} \cs_new_eq:NN \@@_item_equal:n ? \cs_new_eq:NN \@@_item_range:nn ? % \end{macrocode} % \end{macro} % % \begin{macro}[int]{\@@_item_catcode:nT, \@@_item_catcode_reverse:nT} % \begin{macro}[aux]{\@@_item_catcode:} % The argument is a sum of powers of $4$ with exponents given by the % allowed category codes (between $0$ and $13$). Dividing by a given % power of $4$ gives an odd result if and only if that category code % is allowed. If the catcode does not match, then skip the character % code tests which follow. % \begin{macrocode} \cs_new_protected:Npn \@@_item_catcode: { " \if_case:w \l_@@_current_catcode_int 1 \or: 4 \or: 10 \or: 40 \or: 100 \or: \or: 1000 \or: 4000 \or: 10000 \or: \or: 100000 \or: 400000 \or: 1000000 \or: 4000000 \else: 1*\c_zero \fi: } \cs_new_protected:Npn \@@_item_catcode:nT #1 { \if_int_odd:w \__int_eval:w #1 / \@@_item_catcode: \__int_eval_end: \exp_after:wN \use:n \else: \exp_after:wN \use_none:n \fi: } \cs_new_protected:Npn \@@_item_catcode_reverse:nT #1#2 { \@@_item_catcode:nT {#1} { \@@_item_reverse:n {#2} } } % \end{macrocode} % \end{macro} % \end{macro} % % \begin{macro}[int]{\@@_item_exact:nn, \@@_item_exact_cs:c} % This matches an exact \meta{category}-\meta{character code} pair, or % an exact control sequence. % \begin{macrocode} \cs_new_protected:Npn \@@_item_exact:nn #1#2 { \if_int_compare:w #1 = \l_@@_current_catcode_int \if_int_compare:w #2 = \l_@@_current_char_int \exp_after:wN \exp_after:wN \exp_after:wN \@@_break_true:w \fi: \fi: } \cs_new_protected:Npn \@@_item_exact_cs:c #1 { \int_compare:nNnTF \l_@@_current_catcode_int = \c_zero { \str_if_eq_x:nnTF { \exp_after:wN \exp_after:wN \exp_after:wN \cs_to_str:N \tex_the:D \tex_toks:D \l_@@_current_pos_int } { #1 } { \@@_break_true:w } { } } { } } % \end{macrocode} % \end{macro} % % \begin{macro}[int]{\@@_item_cs:n} % Match a control sequence (the argument is a compiled regex). % First test the catcode of the current token to be zero. % Then perform the matching test, and break if the csname % indeed matches. The three \cs{exp_after:wN} expand the contents % of the \tn{toks}\meta{current position} (of the form \cs{exp_not:n} % \Arg{control sequence}) to \meta{control sequence}. % We store the cs name before building states for the cs, as those % states may overlap with toks registers storing the user's input. % \begin{macrocode} \cs_new_protected:Npn \@@_item_cs:n #1 { \int_compare:nNnT \l_@@_current_catcode_int = \c_zero { \group_begin: \tl_set:Nx \l_@@_cs_name_tl { \exp_after:wN \exp_after:wN \exp_after:wN \cs_to_str:N \tex_the:D \tex_toks:D \l_@@_current_pos_int } \@@_single_match: \@@_disable_submatches: \@@_build_for_cs:n {#1} \bool_set_eq:NN \l_@@_saved_success_bool \g_@@_success_bool \exp_args:NV \@@_match:n \l_@@_cs_name_tl \if_meaning:w \c_true_bool \g_@@_success_bool \group_insert_after:N \@@_break_true:w \fi: \bool_gset_eq:NN \g_@@_success_bool \l_@@_saved_success_bool \group_end: } } % \end{macrocode} % \end{macro} % % \subsubsection{Character property tests} % % \begin{macro}[aux] % { % \@@_prop_d:, \@@_prop_h:, \@@_prop_s:, % \@@_prop_v:, \@@_prop_w:, \@@_prop_N: % } % Character property tests for |\d|, |\W|, \emph{etc.} These character % properties are not affected by the |(?i)| option. The characters % recognized by each one are as follows: |\d=[0-9]|, % |\w=[0-9A-Z_a-z]|, \verb*+\s=[\ \^^I\^^J\^^L\^^M]+, % \verb*+\h=[\ \^^I]+, |\v=[\^^J-\^^M]|, and the upper case % counterparts match anything that the lower case does not match. The % order in which the various tests appear is optimized for usual % mostly lower case letter text. % \begin{macrocode} \cs_new_protected_nopar:Npn \@@_prop_d: { \@@_item_caseful_range:nn \c_forty_eight { 57 } } % 0--9 \cs_new_protected_nopar:Npn \@@_prop_h: { \@@_item_caseful_equal:n \c_thirty_two % space \@@_item_caseful_equal:n \c_nine % tab } \cs_new_protected_nopar:Npn \@@_prop_s: { \@@_item_caseful_equal:n \c_thirty_two % space \@@_item_caseful_equal:n \c_nine % tab \@@_item_caseful_equal:n \c_ten % lf \@@_item_caseful_equal:n \c_twelve % ff \@@_item_caseful_equal:n \c_thirteen % cr } \cs_new_protected_nopar:Npn \@@_prop_v: { \@@_item_caseful_range:nn \c_ten \c_thirteen } % lf, vtab, ff, cr \cs_new_protected_nopar:Npn \@@_prop_w: { \@@_item_caseful_range:nn \c_ninety_seven { 122 } % a--z \@@_item_caseful_range:nn \c_sixty_five { 90 } % A--Z \@@_item_caseful_range:nn \c_forty_eight { 57 } % 0--9 \@@_item_caseful_equal:n { 95 } % _ } \cs_new_protected_nopar:Npn \@@_prop_N: { \@@_item_reverse:n { \@@_item_caseful_equal:n \c_ten } } % \end{macrocode} % \end{macro} % % \begin{macro}[aux] % { % \@@_posix_alnum:, \@@_posix_alpha:, \@@_posix_ascii:, % \@@_posix_blank:, \@@_posix_cntrl:, \@@_posix_digit:, % \@@_posix_graph:, \@@_posix_lower:, \@@_posix_print:, % \@@_posix_punct:, \@@_posix_space:, \@@_posix_upper:, % \@@_posix_word: , \@@_posix_xdigit: % } % \textsc{posix} properties. No surprise. % \begin{macrocode} \cs_new_protected_nopar:Npn \@@_posix_alnum: { \@@_posix_alpha: \@@_posix_digit: } \cs_new_protected_nopar:Npn \@@_posix_alpha: { \@@_posix_lower: \@@_posix_upper: } \cs_new_protected_nopar:Npn \@@_posix_ascii: { \@@_item_caseful_range:nn \c_zero \c_one_hundred_twenty_seven } \cs_new_eq:NN \@@_posix_blank: \@@_prop_h: \cs_new_protected_nopar:Npn \@@_posix_cntrl: { \@@_item_caseful_range:nn \c_zero { 31 } \@@_item_caseful_equal:n \c_one_hundred_twenty_seven } \cs_new_eq:NN \@@_posix_digit: \@@_prop_d: \cs_new_protected_nopar:Npn \@@_posix_graph: { \@@_item_caseful_range:nn { 33 } { 126 } } \cs_new_protected_nopar:Npn \@@_posix_lower: { \@@_item_caseful_range:nn \c_ninety_seven { 122 } } \cs_new_protected_nopar:Npn \@@_posix_print: { \@@_item_caseful_range:nn \c_thirty_two { 126 } } \cs_new_protected_nopar:Npn \@@_posix_punct: { \@@_item_caseful_range:nn { 33 } { 47 } \@@_item_caseful_range:nn { 58 } { 64 } \@@_item_caseful_range:nn { 91 } { 96 } \@@_item_caseful_range:nn { 123 } { 126 } } \cs_new_protected_nopar:Npn \@@_posix_space: { \@@_item_caseful_equal:n \c_thirty_two \@@_item_caseful_range:nn \c_nine \c_thirteen } \cs_new_protected_nopar:Npn \@@_posix_upper: { \@@_item_caseful_range:nn \c_sixty_five { 90 } } \cs_new_eq:NN \@@_posix_word: \@@_prop_w: \cs_new_protected_nopar:Npn \@@_posix_xdigit: { \@@_posix_digit: \@@_item_caseful_range:nn \c_sixty_five { 70 } \@@_item_caseful_range:nn \c_ninety_seven { 102 } } % \end{macrocode} % \end{macro} % % \subsubsection{Simple character escape} % % Before actually parsing the regular expression or the replacement % text, we go through them once, converting |\n| to the character $10$, % \emph{etc.} In this pass, we also convert any special character % (\texttt{*}, \texttt{?}, \texttt{\{}, etc.) or escaped alphanumeric % character into a marker indicating that this was a special sequence, % and replace escaped special characters and non-escaped alphanumeric % characters by markers indicating that those were \enquote{raw} % characters. The rest of the code can then avoid caring about escaping % issues (those can become quite complex to handle in combination with % ranges in character classes). % % Usage: \cs{@@_escape_use:nnnn} \meta{inline~1} \meta{inline~2} % \meta{inline~3} \Arg{token list} The \meta{token list} is converted to % a string, then read from left to right, interpreting backslashes as % escaping the next character. Unescaped characters are fed to the % function \meta{inline~1}, and escaped characters are fed to the function % \meta{inline~2} within an \texttt{x}-expansion context (typically those % functions perform some tests on their argument to decide how to output % them). The escape sequences |\a|, |\e|, |\f|, |\n|, |\r|, |\t| and % |\x| are recognized, and those are replaced by the corresponding % character, then fed to \meta{inline~3}. The result is then left in the % input stream. Spaces are ignored unless escaped. % % The conversion is mostly done within an \texttt{x}-expanding % assignment, except for the |\x| escape sequence, which is not amenable % to that in general. For this, we use the general framework of % \cs{__tl_build:Nw}. % % \begin{macro}[int]{\@@_escape_use:nnnn} % The result is built in \cs{l_@@_internal_a_tl}, which is then % left in the input stream. Go through |#4| once, applying |#1|, % |#2|, or |#3| as relevant to each character (after de-escaping % it). Note that we cannot replace \cs{tl_set:Nx} and % \cs{__tl_build_one:o} by a single call to \cs{__tl_build_one:x}, because % the \texttt{x}-expanding assignment may be interrupted by |\x|. % \begin{macrocode} \cs_new_protected:Npn \@@_escape_use:nnnn #1#2#3#4 { % \trace_push:nnn { regex } { 1 } { @@_escape_use:nnnn } \__tl_build:Nw \l_@@_internal_a_tl \cs_set_nopar:Npn \@@_escape_unescaped:N ##1 { #1 } \cs_set_nopar:Npn \@@_escape_escaped:N ##1 { #2 } \cs_set_nopar:Npn \@@_escape_raw:N ##1 { #3 } \int_set:Nn \tex_escapechar:D { 92 } \__str_gset_other:Nn \g_@@_internal_tl { #4 } \tl_set:Nx \l_@@_internal_b_tl { \exp_after:wN \@@_escape_loop:N \g_@@_internal_tl { break } \__prg_break_point: } \__tl_build_one:o \l_@@_internal_b_tl \__tl_build_end: % \trace_pop:nnn { regex } { 1 } { @@_escape_use:nnnn } \l_@@_internal_a_tl } % \end{macrocode} % \end{macro} % % \begin{macro}[aux]{\@@_escape_loop:N} % \begin{macro}[aux]+\@@_escape_\:w+ % \cs{@@_escape_loop:N} reads one character: if it is special % (space, backslash, or end-marker), perform the associated action, % otherwise it is simply an unescaped character. After a backslash, % the same is done, but unknown characters are \enquote{escaped}. % \begin{macrocode} \cs_new:Npn \@@_escape_loop:N #1 { \cs_if_exist_use:cF { @@_escape_\token_to_str:N #1:w } { \@@_escape_unescaped:N #1 } \@@_escape_loop:N } \cs_new_nopar:cpn { @@_escape_ \c_backslash_str :w } \@@_escape_loop:N #1 { \cs_if_exist_use:cF { @@_escape_/\token_to_str:N #1:w } { \@@_escape_escaped:N #1 } \@@_escape_loop:N } % \end{macrocode} % \end{macro} % \end{macro} % % \begin{macro}[aux] % {\@@_escape_unescaped:N, \@@_escape_escaped:N, \@@_escape_raw:N} % Those functions are never called before being given a new meaning, % so their definitions here don't matter. % \begin{macrocode} \cs_new_eq:NN \@@_escape_unescaped:N ? \cs_new_eq:NN \@@_escape_escaped:N ? \cs_new_eq:NN \@@_escape_raw:N ? % \end{macrocode} % \end{macro} % % \begin{macro}[aux] % { % \@@_escape_break:w, \@@_escape_/break:w, % \@@_escape_/a:w, \@@_escape_/e:w, \@@_escape_/f:w, % \@@_escape_/n:w, \@@_escape_/r:w, \@@_escape_/t:w % } % \begin{macro}[aux]+\@@_escape_ :w+ % The loop is ended upon seeing the end-marker % \enquote{\texttt{break}}, with an error if the string ended in a % backslash. Spaces are ignored, and |\a|, |\e|, |\f|, |\n|, |\r|, % |\t| take their meaning here. % \begin{macrocode} \cs_new_eq:NN \@@_escape_break:w \__prg_break: \cs_new_nopar:cpn { @@_escape_/break:w } { \if_false: { \fi: } \__msg_kernel_error:nn { regex } { trailing-backslash } \exp_after:wN \use_none:n \exp_after:wN { \if_false: } \fi: } \cs_new_nopar:cpn { @@_escape_~:w } { } \cs_new_nopar:cpx { @@_escape_/a:w } { \exp_not:N \@@_escape_raw:N \iow_char:N \^^G } \cs_new_nopar:cpx { @@_escape_/t:w } { \exp_not:N \@@_escape_raw:N \iow_char:N \^^I } \cs_new_nopar:cpx { @@_escape_/n:w } { \exp_not:N \@@_escape_raw:N \iow_char:N \^^J } \cs_new_nopar:cpx { @@_escape_/f:w } { \exp_not:N \@@_escape_raw:N \iow_char:N \^^L } \cs_new_nopar:cpx { @@_escape_/r:w } { \exp_not:N \@@_escape_raw:N \iow_char:N \^^M } \cs_new_nopar:cpx { @@_escape_/e:w } { \exp_not:N \@@_escape_raw:N \iow_char:N \^^[ } % \end{macrocode} % \end{macro} % \end{macro} % % \begin{macro}[aux]{\@@_escape_/x:w} % \begin{macro}[aux]{\@@_escape_x_end:w, \@@_escape_x_large:n} % When |\x| is encountered, \cs{@@_escape_x_test:N} is responsible % for grabbing some hexadecimal digits, and feeding the result to % \cs{@@_escape_x_end:w}. If the number is $<256$, then it is % turned into a byte and fed to \cs{@@_escape_raw:N}. Otherwise, % interrupt the assignment, and either produce an error, or use a % standard \tn{lowercase} trick depending on the precise value. % \begin{macrocode} \cs_new:cpn { @@_escape_/x:w } \@@_escape_loop:N { \exp_after:wN \@@_escape_x_end:w \__int_value:w "0 \@@_escape_x_test:N } \cs_new:Npn \@@_escape_x_end:w #1 ; { \int_compare:nNnTF {#1} < \c_two_hundred_fifty_six { \exp_last_unbraced:Nf \@@_escape_raw:N { \__str_output_byte:n {#1} } } { \@@_escape_x_large:n {#1} } } \group_begin: \char_set_catcode_other:n { 0 } \cs_new:Npn \@@_escape_x_large:n #1 { \if_false: { \fi: } \__tl_build_one:o \l_@@_internal_b_tl \int_compare:nNnTF {#1} > \c_max_char_int { \__msg_kernel_error:nnx { regex } { x-overflow } {#1} \tl_set:Nx \l_@@_internal_b_tl { \if_false: } \fi: } { \char_set_lccode:nn { \c_zero } {#1} \tl_to_lowercase:n { \tl_set:Nx \l_@@_internal_b_tl { \if_false: } \fi: \@@_escape_raw:N ^^@ } } } \group_end: % \end{macrocode} % \end{macro} % \end{macro} % % \begin{macro}[aux]{\@@_escape_x_test:N, \@@_escape_x_testii:N} % Find out whether the first character is a left brace (allowing any % number of hexadecimal digits), or not (allowing up to two % hexadecimal digits). We need to check for the end-of-string marker. % Eventually, call either \cs{@@_escape_x_loop:N} or % \cs{@@_escape_x:N}. % \begin{macrocode} \cs_new:Npn \@@_escape_x_test:N #1 { \str_if_eq_x:nnTF {#1} { break } { ; } { \if_charcode:w \c_space_token #1 \exp_after:wN \@@_escape_x_test:N \else: \exp_after:wN \@@_escape_x_testii:N \exp_after:wN #1 \fi: } } \cs_new:Npn \@@_escape_x_testii:N #1 { \if_charcode:w \c_left_brace_str #1 \exp_after:wN \@@_escape_x_loop:N \else: \__str_hexadecimal_use:NTF #1 { \exp_after:wN \@@_escape_x:N } { ; \exp_after:wN \@@_escape_loop:N \exp_after:wN #1 } \fi: } % \end{macrocode} % \end{macro} % % \begin{macro}[aux]{\@@_escape_x:N} % This looks for the second digit in the unbraced case. % \begin{macrocode} \cs_new:Npn \@@_escape_x:N #1 { \str_if_eq_x:nnTF {#1} { break } { ; } { \__str_hexadecimal_use:NTF #1 { ; \@@_escape_loop:N } { ; \@@_escape_loop:N #1 } } } % \end{macrocode} % \end{macro} % % \begin{macro}[aux]{\@@_escape_x_loop:N, \@@_escape_x_loop_error:} % Grab hexadecimal digits, skip spaces, and at the end, check that % there is a right brace, otherwise raise an error outside the % assignment. % \begin{macrocode} \cs_new:Npn \@@_escape_x_loop:N #1 { \str_if_eq_x:nnTF {#1} { break } { ; \@@_escape_x_loop_error:n { } {#1} } { \__str_hexadecimal_use:NTF #1 { \@@_escape_x_loop:N } { \token_if_eq_charcode:NNTF \c_space_token #1 { \@@_escape_x_loop:N } { ; \exp_after:wN \token_if_eq_charcode:NNTF \c_right_brace_str #1 { \@@_escape_loop:N } { \@@_escape_x_loop_error:n {#1} } } } } } \cs_new:Npn \@@_escape_x_loop_error:n #1 { \if_false: { \fi: } \__tl_build_one:o \l_@@_internal_b_tl \__msg_kernel_error:nnx { regex } { x-missing-rbrace } {#1} \tl_set:Nx \l_@@_internal_b_tl { \if_false: } \fi: \@@_escape_loop:N #1 } % \end{macrocode} % \end{macro} % % \begin{macro}[EXP, aux] % {\@@_char_if_alphanumeric:NTF, \@@_char_if_special:NTF} % These two tests are used in the first pass when parsing a regular % expression. That pass is responsible for finding escaped and % non-escaped characters, and recognizing which ones have special % meanings and which should be interpreted as \enquote{raw} % characters. Namely, % \begin{itemize} % \item alphanumerics are \enquote{raw} if they are not escaped, and % may have a special meaning when escaped; % \item non-alphanumeric printable ascii characters are % \enquote{raw} if they are escaped, and may have a special % meaning when not escaped; % \item characters other than printable ascii are always % \enquote{raw}. % \end{itemize} % The code is ugly, and highly based on magic numbers and the ascii % codes of characters. This is mostly unavoidable for performance % reasons. Maybe the tests can be optimized a little bit more. % Here, \enquote{alphanumeric} means \texttt{0}--\texttt{9}, % \texttt{A}--\texttt{Z}, \texttt{a}--\texttt{z}; % \enquote{special} character means non-alphanumeric % but printable ascii, from space (hex \texttt{20}) to % \texttt{del} (hex \texttt{7E}). % \begin{macrocode} \prg_new_conditional:Npnn \@@_char_if_special:N #1 { TF } { \if_int_compare:w `#1 < \c_ninety_one \if_int_compare:w `#1 < \c_fifty_eight \if_int_compare:w `#1 < \c_forty_eight \if_int_compare:w `#1 < \c_thirty_two \prg_return_false: \else: \prg_return_true: \fi: \else: \prg_return_false: \fi: \else: \if_int_compare:w `#1 < \c_sixty_five \prg_return_true: \else: \prg_return_false: \fi: \fi: \else: \if_int_compare:w `#1 < \c_one_hundred_twenty_three \if_int_compare:w `#1 < \c_ninety_seven \prg_return_true: \else: \prg_return_false: \fi: \else: \if_int_compare:w `#1 < \c_one_hundred_twenty_seven \prg_return_true: \else: \prg_return_false: \fi: \fi: \fi: } \prg_new_conditional:Npnn \@@_char_if_alphanumeric:N #1 { TF } { \if_int_compare:w `#1 < \c_ninety_one \if_int_compare:w `#1 < \c_fifty_eight \if_int_compare:w `#1 < \c_forty_eight \prg_return_false: \else: \prg_return_true: \fi: \else: \if_int_compare:w `#1 < \c_sixty_five \prg_return_false: \else: \prg_return_true: \fi: \fi: \else: \if_int_compare:w `#1 < \c_one_hundred_twenty_three \if_int_compare:w `#1 < \c_ninety_seven \prg_return_false: \else: \prg_return_true: \fi: \else: \prg_return_false: \fi: \fi: } % \end{macrocode} % \end{macro} % % \subsection{Compiling} % % A regular expression starts its life as a string of characters. In % this section, we convert it to internal instructions, resulting in a % \enquote{compiled} regular expression. This compiled expression is % then turned into states of an automaton in the building % phase. Compiled regular expressions consist of the following: % \begin{itemize} % \item \cs{@@_class:NnnnN} \meta{boolean} \Arg{tests} \Arg{min} % \Arg{more} \meta{lazyness} % \item \cs{@@_group:nnnN} \Arg{branches} \Arg{min} \Arg{more} % \meta{lazyness}, also \cs{@@_group_no_capture:nnnN} and % \cs{@@_group_resetting:nnnN} with the same syntax. % \item \cs{@@_branch:n} \Arg{contents} % \item \cs{@@_command_K:} % \item \cs{@@_assertion:Nn} \meta{boolean} \Arg{assertion test}, % where the \meta{assertion test} is \cs{@@_b_test:} or % |{|\cs{@@_anchor:N} \meta{integer}|}| % \end{itemize} % Tests can be the following: % \begin{itemize} % \item \cs{@@_item_caseful_equal:n} \Arg{char code} % \item \cs{@@_item_caseless_equal:n} \Arg{char code} % \item \cs{@@_item_caseful_range:nn} \Arg{min} \Arg{max} % \item \cs{@@_item_caseless_range:nn} \Arg{min} \Arg{max} % \item \cs{@@_item_catcode:nT} \Arg{catcode bitmap} \Arg{tests} % \item \cs{@@_item_catcode_reverse:nT} \Arg{catcode bitmap} \Arg{tests} % \item \cs{@@_item_reverse:n} \Arg{tests} % \item \cs{@@_item_exact:nn} \Arg{catcode} \Arg{char code} % \item \cs{@@_item_exact_cs:c} \Arg{csname} % \item \cs{@@_item_cs:n} \Arg{compiled regex} % \end{itemize} % % \subsubsection{Variables used when compiling} % % \begin{variable}{\l_@@_group_level_int} % We make sure to open the same number of groups as we close. % \begin{macrocode} \int_new:N \l_@@_group_level_int % \end{macrocode} % \end{variable} % % \begin{variable}{\l_@@_mode_int} % While compiling, ten modes are recognized, labelled $-63$, $-23$, % $-6$, $-2$, $0$, $2$, $3$, $6$, $23$, $63$. See % section~\ref{sec:regex-modes}. % \begin{macrocode} \int_new:N \l_@@_mode_int % \end{macrocode} % \end{variable} % % \begin{variable}{\l_@@_catcodes_int, \l_@@_default_catcodes_int} % \begin{variable}{\l_@@_catcodes_bool} % We wish to allow constructions such as |\c[^BE](..\cL[a-z]..)|, % where the outer catcode test applies to the whole group, but is % superseded by the inner catcode test. For this to work, we need to % keep track of lists of allowed category codes: % \cs{l_@@_catcodes_int} and \cs{l_@@_default_catcodes_int} are % bitmaps, sums of $4^c$, for all allowed catcodes $c$. The latter is % local to each capturing group, and we reset % \cs{l_@@_catcodes_int} to that value after each character or % class, changing it only when encountering a |\c| escape. The boolean % records whether the list of categories of a catcode test has to be % inverted: compare |\c[^BE]| and |\c[BE]|. % \begin{macrocode} \int_new:N \l_@@_catcodes_int \int_new:N \l_@@_default_catcodes_int \bool_new:N \l_@@_catcodes_bool % \end{macrocode} % \end{variable} % \end{variable} % % \begin{variable} % { % \c_@@_catcode_C_int, \c_@@_catcode_B_int, \c_@@_catcode_E_int, % \c_@@_catcode_M_int, \c_@@_catcode_T_int, \c_@@_catcode_P_int, % \c_@@_catcode_U_int, \c_@@_catcode_D_int, \c_@@_catcode_S_int, % \c_@@_catcode_L_int, \c_@@_catcode_O_int, \c_@@_catcode_A_int % } % \begin{variable}{\c_@@_all_catcodes_int} % Constants: $4^c$ for each category, and the sum of all powers of $4$. % \begin{macrocode} \int_const:Nn \c_@@_catcode_C_int { "1 } \int_const:Nn \c_@@_catcode_B_int { "4 } \int_const:Nn \c_@@_catcode_E_int { "10 } \int_const:Nn \c_@@_catcode_M_int { "40 } \int_const:Nn \c_@@_catcode_T_int { "100 } \int_const:Nn \c_@@_catcode_P_int { "1000 } \int_const:Nn \c_@@_catcode_U_int { "4000 } \int_const:Nn \c_@@_catcode_D_int { "10000 } \int_const:Nn \c_@@_catcode_S_int { "100000 } \int_const:Nn \c_@@_catcode_L_int { "400000 } \int_const:Nn \c_@@_catcode_O_int { "1000000 } \int_const:Nn \c_@@_catcode_A_int { "4000000 } \int_const:Nn \c_@@_all_catcodes_int { "5515155 } % \end{macrocode} % \end{variable} % \end{variable} % % \begin{variable}{\l_@@_internal_regex} % The compilation step stores its result in this variable. % \begin{macrocode} \cs_new_eq:NN \l_@@_internal_regex \c_@@_no_match_regex % \end{macrocode} % \end{variable} % % \begin{variable}{\l_@@_show_prefix_seq} % This sequence holds the prefix that makes up the line displayed to % the user. The various items must be removed from the right, which is % tricky with a token list, hence we use a sequence. % \begin{macrocode} \seq_new:N \l_@@_show_prefix_seq % \end{macrocode} % \end{variable} % % \begin{variable}{\l_@@_show_lines_int} % A hack. To know whether a given class has a single item in it or % not, we count the number of lines when showing the class. % \begin{macrocode} \int_new:N \l_@@_show_lines_int % \end{macrocode} % \end{variable} % % \subsubsection{Generic helpers used when compiling} % % \begin{macro}[int]{\@@_get_digits:NTFw} % \begin{macro}[aux, rEXP]{\@@_get_digits_loop:w} % If followed by some raw digits, collect them one by one in the % integer variable |#1|, and take the \texttt{true} branch. Otherwise, % take the \texttt{false} branch. % \begin{macrocode} \cs_new_protected:Npn \@@_get_digits:NTFw #1#2#3#4#5 { \@@_if_raw_digit:NNTF #4 #5 { #1 = #5 \@@_get_digits_loop:nw {#2} } { #3 #4 #5 } } \cs_new:Npn \@@_get_digits_loop:nw #1#2#3 { \@@_if_raw_digit:NNTF #2 #3 { #3 \@@_get_digits_loop:nw {#1} } { \scan_stop: #1 #2 #3 } } % \end{macrocode} % \end{macro} % \end{macro} % % \begin{macro}[aux, EXP]{\@@_if_raw_digit:NNTF} % Test used when grabbing digits for the |{m,n}| quantifier. % It only accepts non-escaped digits. % \begin{macrocode} \prg_new_conditional:Npnn \@@_if_raw_digit:NN #1#2 { TF } { \if_meaning:w \@@_compile_raw:N #1 \if_int_compare:w \c_one < 1 #2 \exp_stop_f: \prg_return_true: \else: \prg_return_false: \fi: \else: \prg_return_false: \fi: } % \end{macrocode} % \end{macro} % % \subsubsection{Mode} % \label{sec:regex-modes} % % When compiling the \textsc{nfa} corresponding to a given regex string, % we can be in ten distinct modes, which we label by some magic numbers: % \begin{itemize} % \item[-6] |[\c{...}]| control sequence in a class, % \item[-2] |\c{...}| control sequence, % \item[0] |...| outer, % \item[2] |\c...| catcode test, % \item[6] |[\c...]| catcode test in a class, % \item[-63] |[\c{[...]}]| class inside mode $-6$, % \item[-23] |\c{[...]}| class inside mode $-2$, % \item[3] |[...]| class inside mode $-3$, % \item[23] |\c[...]| class inside mode $2$, % \item[63] |[\c[...]]| class inside mode $6$. % \end{itemize} % This list is exhaustive, because |\c| escape sequences cannot be % nested, and character classes cannot be nested directly. The choice of % numbers is such as to optimize the most useful tests, and make % transitions from one mode to another as simple as possible. % \begin{itemize} % \item Even modes mean that we are not directly in a character class. % In this case, a left bracket appends $3$ to the mode. In a % character class, a right bracket changes the mode as $m\to % (m-15)/13$, truncated. % \item Grouping, assertion, and anchors are allowed in non-positive % even modes ($0$, $-2$, $-6$), and do not change the % mode. Otherwise, they trigger an error. % \item A left bracket is special in even modes, appending $3$ to the % mode; in those modes, quantifiers and the dot are recognized, and % the right bracket is normal. In odd modes (within classes), the % left bracket is normal, but the right bracket ends the class, % changing the mode from $m$ to $(m-15)/13$, truncated; also, ranges % are recognized. % \item In non-negative modes, left and right braces are normal. In % negative modes, however, left braces trigger a warning; right % braces end the control sequence, going from $-2$ to $0$ or $-6$ to % $3$, with error recovery for odd modes. % \item Properties (such as the |\d| character class) can appear in % any mode. % \end{itemize} % % \begin{macro}[int, EXP]{\@@_if_in_class:TF} % Test whether we are directly in a character class (at the innermost % level of nesting). There, many escape sequences are not recognized, % and special characters are normal. Also, for every raw character, we % must look ahead for a possible raw dash. % \begin{macrocode} \cs_new_nopar:Npn \@@_if_in_class:TF { \if_int_odd:w \l_@@_mode_int \exp_after:wN \use_i:nn \else: \exp_after:wN \use_ii:nn \fi: } % \end{macrocode} % \end{macro} % % \begin{macro}[int, EXP]{\@@_if_in_cs:TF} % Right braces are special only directly inside control sequences (at % the inner-most level of nesting, not counting groups). % \begin{macrocode} \cs_new_nopar:Npn \@@_if_in_cs:TF { \if_int_odd:w \l_@@_mode_int \exp_after:wN \use_ii:nn \else: \if_int_compare:w \l_@@_mode_int < \c_zero \exp_after:wN \exp_after:wN \exp_after:wN \use_i:nn \else: \exp_after:wN \exp_after:wN \exp_after:wN \use_ii:nn \fi: \fi: } % \end{macrocode} % \end{macro} % % \begin{macro}[int, EXP]{\@@_if_in_class_or_catcode:TF} % Assertions are only allowed in modes $0$, $-2$, and $-6$, % \emph{i.e.}, even, non-positive modes. % \begin{macrocode} \cs_new_nopar:Npn \@@_if_in_class_or_catcode:TF { \if_int_odd:w \l_@@_mode_int \exp_after:wN \use_i:nn \else: \if_int_compare:w \l_@@_mode_int > \c_zero \exp_after:wN \exp_after:wN \exp_after:wN \use_i:nn \else: \exp_after:wN \exp_after:wN \exp_after:wN \use_ii:nn \fi: \fi: } % \end{macrocode} % \end{macro} % % \begin{macro}[int, EXP]{\@@_if_within_catcode:TF} % This test takes the true branch if we are in a catcode test, either % immediately following it (modes $2$ and $6$) or in a class on which % it applies (modes $23$ and $63$). This is used to tweak how left % brackets behave in modes $2$ and $6$. % \begin{macrocode} \cs_new_nopar:Npn \@@_if_within_catcode:TF { \if_int_compare:w \l_@@_mode_int > \c_zero \exp_after:wN \use_i:nn \else: \exp_after:wN \use_ii:nn \fi: } % \end{macrocode} % \end{macro} % % \begin{macro}[aux]{\@@_chk_c_allowed:T} % The |\c| escape sequence is only allowed in modes $0$ and $3$, % \emph{i.e.}, not within any other |\c| escape sequence. % \begin{macrocode} \cs_new_protected:Npn \@@_chk_c_allowed:T { \if_int_compare:w \l_@@_mode_int = \c_zero \exp_after:wN \use:n \else: \if_int_compare:w \l_@@_mode_int = \c_three \exp_after:wN \exp_after:wN \exp_after:wN \use:n \else: \__msg_kernel_error:nn { regex } { c-bad-mode } \exp_after:wN \exp_after:wN \exp_after:wN \use_none:n \fi: \fi: } % \end{macrocode} % \end{macro} % % \begin{macro}[aux]{\@@_mode_quit_c:} % This function changes the mode as it is needed just after a catcode % test. % \begin{macrocode} \cs_new_protected:Npn \@@_mode_quit_c: { \if_int_compare:w \l_@@_mode_int = \c_two \l_@@_mode_int = \c_zero \else: \if_int_compare:w \l_@@_mode_int = \c_six \l_@@_mode_int = \c_three \fi: \fi: } % \end{macrocode} % \end{macro} % % \subsubsection{Framework} % % \begin{macro}[int]{\@@_compile:w, \@@_compile_end:} % Used when compiling a user regex or a regex for the |\c{...}| escape % sequence within another regex. Start building a token list within a % group (with \texttt{x}-expansion at the outset), and set a few % variables (group level, catcodes), then start the first branch. At % the end, make sure there are no dangling classes nor groups, close % the last branch: we are done building \cs{l_@@_internal_regex}. % \begin{macrocode} \cs_new_protected_nopar:Npn \@@_compile:w { \__tl_build_x:Nw \l_@@_internal_regex \int_zero:N \l_@@_group_level_int \int_set_eq:NN \l_@@_default_catcodes_int \c_@@_all_catcodes_int \int_set_eq:NN \l_@@_catcodes_int \l_@@_default_catcodes_int \cs_set_nopar:Npn \@@_item_equal:n { \@@_item_caseful_equal:n } \cs_set_nopar:Npn \@@_item_range:nn { \@@_item_caseful_range:nn } \__tl_build_one:n { \@@_branch:n { \if_false: } \fi: } } \cs_new_protected_nopar:Npn \@@_compile_end: { \@@_if_in_class:TF { \__msg_kernel_error:nn { regex } { missing-rbrack } \use:c { @@_compile_]: } \prg_do_nothing: \prg_do_nothing: } { } \if_int_compare:w \l_@@_group_level_int > \c_zero \__msg_kernel_error:nnx { regex } { missing-rparen } { \int_use:N \l_@@_group_level_int } \prg_replicate:nn { \l_@@_group_level_int } { \__tl_build_one:n { \if_false: { \fi: } \if_false: { \fi: } { 1 } { 0 } \c_true_bool } \__tl_build_end: \__tl_build_one:o \l_@@_internal_regex } \fi: \__tl_build_one:n { \if_false: { \fi: } } \__tl_build_end: } % \end{macrocode} % \end{macro} % % \begin{macro}[int]{\@@_compile:n} % The compilation is done between \cs{@@_compile:w} and % \cs{@@_compile_end:}, starting in mode~$0$. Then % \cs{@@_escape_use:nnnn} distinguishes special characters, escaped % alphanumerics, and raw characters, interpreting |\a|, |\x| and other % sequences. The $4$ trailing \cs{prg_do_nothing:} are needed because % some functions defined later look up to $4$ tokens ahead. Before % ending, make sure that any |\c{...}| is properly closed. No need to % check that brackets are closed properly since \cs{@@_compile_end:} % does that. However, catch the case of a trailing |\cL| % construction. % \begin{macrocode} \cs_new_protected:Npn \@@_compile:n #1 { \@@_compile:w \int_set:Nn \tex_escapechar:D { 92 } \int_set_eq:NN \l_@@_mode_int \c_zero \@@_escape_use:nnnn { \@@_char_if_special:NTF ##1 \@@_compile_special:N \@@_compile_raw:N ##1 } { \@@_char_if_alphanumeric:NTF ##1 \@@_compile_escaped:N \@@_compile_raw:N ##1 } { \@@_compile_raw:N ##1 } { #1 } \prg_do_nothing: \prg_do_nothing: \prg_do_nothing: \prg_do_nothing: \int_compare:nNnT \l_@@_mode_int = \c_two { \__msg_kernel_error:nn { regex } { c-trailing } } \int_compare:nNnT \l_@@_mode_int < \c_zero { \__msg_kernel_error:nn { regex } { c-missing-rbrace } \@@_compile_end: \@@_compile_one:x { \@@_item_cs:n { \exp_not:o \l_@@_internal_regex } } \prg_do_nothing: \prg_do_nothing: \prg_do_nothing: \prg_do_nothing: } \@@_compile_end: } % \end{macrocode} % \end{macro} % % \begin{macro}[int]{\@@_compile_escaped:N, \@@_compile_special:N} % If the special character or escaped alphanumeric has a particular % meaning in regexes, the corresponding function is used. Otherwise, % it is interpreted as a raw character. We distinguish special % characters from escaped alphanumeric characters because they behave % differently when appearing as an end-point of a range. % \begin{macrocode} \cs_new_protected:Npn \@@_compile_special:N #1 { \cs_if_exist_use:cF { @@_compile_#1: } { \@@_compile_raw:N #1 } } \cs_new_protected:Npn \@@_compile_escaped:N #1 { \cs_if_exist_use:cF { @@_compile_/#1: } { \@@_compile_raw:N #1 } } % \end{macrocode} % \end{macro} % % \begin{macro}[int]{\@@_compile_one:x} % This is used after finding one \enquote{test}, such as |\d|, or a % raw character. If that followed a catcode test (\emph{e.g.}, |\cL|), % then restore the mode. If we are not in a class, then the test is % \enquote{standalone}, and we need to add \cs{@@_class:NnnnN} and % search for quantifiers. In any case, insert the test, possibly % together with a catcode test if appropriate. % \begin{macrocode} \cs_new_protected:Npn \@@_compile_one:x #1 { \@@_mode_quit_c: \@@_if_in_class:TF { } { \__tl_build_one:n { \@@_class:NnnnN \c_true_bool { \if_false: } \fi: } } \__tl_build_one:x { \if_int_compare:w \l_@@_catcodes_int < \c_@@_all_catcodes_int \@@_item_catcode:nT { \int_use:N \l_@@_catcodes_int } { \exp_not:N \exp_not:n {#1} } \else: \exp_not:N \exp_not:n {#1} \fi: } \int_set_eq:NN \l_@@_catcodes_int \l_@@_default_catcodes_int \@@_if_in_class:TF { } { \@@_compile_quantifier:w } } % \end{macrocode} % \end{macro} % % \begin{macro}[int] % {\@@_compile_abort_tokens:n, \@@_compile_abort_tokens:x} % This function places the collected tokens back in the input stream, % each as a raw character. Spaces are not preserved. % \begin{macrocode} \cs_new_protected:Npn \@@_compile_abort_tokens:n #1 { \use:x { \exp_args:No \tl_map_function:nN { \tl_to_str:n {#1} } \@@_compile_raw:N } } \cs_generate_variant:Nn \@@_compile_abort_tokens:n { x } % \end{macrocode} % \end{macro} % % \subsubsection{Quantifiers} % % \begin{macro}[int]{\@@_compile_quantifier:w} % This looks ahead and finds any quantifier (special character equal % to either of \texttt{?+*\{}). % \begin{macrocode} \cs_new_protected:Npn \@@_compile_quantifier:w #1#2 { \token_if_eq_meaning:NNTF #1 \@@_compile_special:N { \cs_if_exist_use:cF { @@_compile_quantifier_#2:w } { \@@_compile_quantifier_none: #1 #2 } } { \@@_compile_quantifier_none: #1 #2 } } % \end{macrocode} % \end{macro} % % \begin{macro}[aux]{\@@_compile_quantifier_none:} % \begin{macro}[aux]{\@@_compile_quantifier_abort:xNN} % Those functions are called whenever there is no quantifier, or a % braced construction is invalid (equivalent to no quantifier, and % whatever characters were grabbed are left raw). % \begin{macrocode} \cs_new_protected:Npn \@@_compile_quantifier_none: { \__tl_build_one:n { \if_false: { \fi: } { 1 } { 0 } \c_false_bool } } \cs_new_protected:Npn \@@_compile_quantifier_abort:xNN #1#2#3 { \@@_compile_quantifier_none: \__msg_kernel_warning:nnxx { regex } { invalid-quantifier } {#1} {#3} \@@_compile_abort_tokens:x {#1} #2 #3 } % \end{macrocode} % \end{macro} % \end{macro} % % \begin{macro}[aux]{\@@_compile_quantifier_lazyness:nnNN} % Once the \enquote{main} quantifier (\texttt{?}, \texttt{*}, % \texttt{+} or a braced construction) is found, we check whether it % is lazy (followed by a question mark). We then add to the compiled % regex a closing brace (ending \cs{@@_class:NnnnN} and friends), % the start-point of the range, its end-point, and a boolean, % \texttt{true} for lazy and \texttt{false} for greedy operators. % \begin{macrocode} \cs_new_protected:Npn \@@_compile_quantifier_lazyness:nnNN #1#2#3#4 { \str_if_eq:nnTF { #3 #4 } { \@@_compile_special:N ? } { \__tl_build_one:n { \if_false: { \fi: } { #1 } { #2 } \c_true_bool } } { \__tl_build_one:n { \if_false: { \fi: } { #1 } { #2 } \c_false_bool } #3 #4 } } % \end{macrocode} % \end{macro} % % \begin{macro}[aux] % { % \@@_compile_quantifier_?:w, % \@@_compile_quantifier_*:w, % \@@_compile_quantifier_+:w % } % For each \enquote{basic} quantifier, |?|, |*|, |+|, feed the correct % arguments to \cs{@@_compile_quantifier_lazyness:nnNN}, $-1$ means % that there is no upper bound on the number of repetitions. % \begin{macrocode} \cs_new_protected_nopar:cpn { @@_compile_quantifier_?:w } { \@@_compile_quantifier_lazyness:nnNN { 0 } { 1 } } \cs_new_protected_nopar:cpn { @@_compile_quantifier_*:w } { \@@_compile_quantifier_lazyness:nnNN { 0 } { -1 } } \cs_new_protected_nopar:cpn { @@_compile_quantifier_+:w } { \@@_compile_quantifier_lazyness:nnNN { 1 } { -1 } } % \end{macrocode} % \end{macro} % % \begin{macro}[aux]+\@@_compile_quantifier_{:w+ ^^A} % \begin{macro}[aux] % { % \@@_compile_quantifier_braced_auxi:w, % \@@_compile_quantifier_braced_auxii:w, % \@@_compile_quantifier_braced_auxiii:w, % } % Three possible syntaxes: \texttt{\{\meta{int}\}}, % \texttt{\{\meta{int},\}}, or \texttt{\{\meta{int},\meta{int}\}}. Any % other syntax causes us to abort and put whatever we collected back % in the input stream, as \texttt{raw} characters, including the % opening brace. Grab a number into \cs{l_@@_internal_a_int}. If % the number is followed by a right brace, the range is $[a,a]$. If % followed by a comma, grab one more number, and call the \texttt{_ii} % or \texttt{_iii} auxiliary. Those auxiliaries check for a closing % brace, leading to the range $[a,\infty]$ or $[a,b]$, encoded as % $\{a\}\{-1\}$ and $\{a\}\{b-a\}$. % \begin{macrocode} \cs_new_protected:cpn { @@_compile_quantifier_ \c_left_brace_str :w } { \@@_get_digits:NTFw \l_@@_internal_a_int { \@@_compile_quantifier_braced_auxi:w } { \@@_compile_quantifier_abort:xNN { \c_left_brace_str } } } \cs_new_protected:Npn \@@_compile_quantifier_braced_auxi:w #1#2 { \str_case_x:nnn { #1 #2 } { { \@@_compile_special:N \c_right_brace_str } { \exp_args:No \@@_compile_quantifier_lazyness:nnNN { \int_use:N \l_@@_internal_a_int } { 0 } } { \@@_compile_special:N , } { \@@_get_digits:NTFw \l_@@_internal_b_int { \@@_compile_quantifier_braced_auxiii:w } { \@@_compile_quantifier_braced_auxii:w } } } { \@@_compile_quantifier_abort:xNN { \c_left_brace_str \int_use:N \l_@@_internal_a_int } #1 #2 } } \cs_new_protected:Npn \@@_compile_quantifier_braced_auxii:w #1#2 { \str_if_eq_x:nnTF { #1 #2 } { \@@_compile_special:N \c_right_brace_str } { \exp_args:No \@@_compile_quantifier_lazyness:nnNN { \int_use:N \l_@@_internal_a_int } { -1 } } { \@@_compile_quantifier_abort:xNN { \c_left_brace_str \int_use:N \l_@@_internal_a_int , } #1 #2 } } \cs_new_protected:Npn \@@_compile_quantifier_braced_auxiii:w #1#2 { \str_if_eq_x:nnTF { #1 #2 } { \@@_compile_special:N \c_right_brace_str } { \if_int_compare:w \l_@@_internal_a_int > \l_@@_internal_b_int \__msg_kernel_error:nnxx { regex } { backwards-quantifier } { \int_use:N \l_@@_internal_a_int } { \int_use:N \l_@@_internal_b_int } \int_zero:N \l_@@_internal_b_int \else: \int_sub:Nn \l_@@_internal_b_int \l_@@_internal_a_int \fi: \exp_args:Noo \@@_compile_quantifier_lazyness:nnNN { \int_use:N \l_@@_internal_a_int } { \int_use:N \l_@@_internal_b_int } } { \@@_compile_quantifier_abort:xNN { \c_left_brace_str \int_use:N \l_@@_internal_a_int , \int_use:N \l_@@_internal_b_int } #1 #2 } } % \end{macrocode} % \end{macro} % \end{macro} % % \subsubsection{Raw characters} % % \begin{macro}[int]{\@@_compile_raw_error:N} % Within character classes, and following catcode tests, some escaped % alphanumeric sequences such as |\b| do not have any meaning. They % are replaced by a raw character, after spitting out an error. % \begin{macrocode} \cs_new_protected:Npn \@@_compile_raw_error:N #1 { \__msg_kernel_error:nnx { regex } { bad-escape } {#1} \@@_compile_raw:N #1 } % \end{macrocode} % \end{macro} % % \begin{macro}[int]{\@@_compile_raw:N} % If we are in a character class and the next character is an % unescaped dash, this denotes a range. Otherwise, the current % character |#1| matches itself. % \begin{macrocode} \cs_new_protected:Npn \@@_compile_raw:N #1#2#3 { \@@_if_in_class:TF { \str_if_eq:nnTF {#2#3} { \@@_compile_special:N - } { \@@_compile_range:Nw #1 } { \@@_compile_one:x { \@@_item_equal:n { \__int_value:w `#1 ~ } } #2 #3 } } { \@@_compile_one:x { \@@_item_equal:n { \__int_value:w `#1 ~ } } #2 #3 } } % \end{macrocode} % \end{macro} % % \begin{macro}[aux]{\@@_compile_range:Nw, \@@_if_end_range:NNTF} % We have just read a raw character followed by a dash; this should be % followed by an end-point for the range. Valid end-points are: any % raw character; any special character, except a right bracket. In % particular, escaped characters are forbidden. % \begin{macrocode} \prg_new_protected_conditional:Npnn \@@_if_end_range:NN #1#2 { TF } { \if_meaning:w \@@_compile_raw:N #1 \prg_return_true: \else: \if_meaning:w \@@_compile_special:N #1 \if_charcode:w ] #2 \prg_return_false: \else: \prg_return_true: \fi: \else: \prg_return_false: \fi: \fi: } \cs_new_protected:Npn \@@_compile_range:Nw #1#2#3 { \@@_if_end_range:NNTF #2 #3 { \if_int_compare:w `#1 > `#3 \exp_stop_f: \__msg_kernel_error:nnxx { regex } { range-backwards } {#1} {#3} \else: \__tl_build_one:x { \if_int_compare:w `#1 = `#3 \exp_stop_f: \@@_item_equal:n \else: \@@_item_range:nn { \__int_value:w `#1 ~ } \fi: { \__int_value:w `#3 ~ } } \fi: } { \__msg_kernel_warning:nnxx { regex } { range-missing-end } {#1} { \c_backslash_str #3 } \__tl_build_one:x { \@@_item_equal:n { \__int_value:w `#1 ~ } \@@_item_equal:n { \__int_value:w `- ~ } } #2#3 } } % \end{macrocode} % \end{macro} % % \subsubsection{Character properties} % % \begin{macro}[aux]{\@@_compile_.:, \@@_prop_.:} % In a class, the dot has no special meaning. Outside, insert % \cs{@@_prop_.:}, which matches any character or control % sequence, and refuses $-2$ (end-marker). % \begin{macrocode} \cs_new_protected_nopar:cpx { @@_compile_.: } { \exp_not:N \@@_if_in_class:TF { \@@_compile_raw:N . } { \@@_compile_one:x \exp_not:c { @@_prop_.: } } } \cs_new_protected_nopar:cpn { @@_prop_.: } { \if_int_compare:w \l_@@_current_char_int > - \c_two \exp_after:wN \@@_break_true:w \fi: } % \end{macrocode} % \end{macro} % % \begin{macro}[aux] % { % \@@_compile_/d:, \@@_compile_/D:, % \@@_compile_/h:, \@@_compile_/H:, % \@@_compile_/s:, \@@_compile_/S:, % \@@_compile_/v:, \@@_compile_/V:, % \@@_compile_/w:, \@@_compile_/W:, % \@@_compile_/N:, % } % The constants \cs{@@_prop_d:}, \emph{etc.} hold % a list of tests which match the corresponding character % class, and jump to the \cs{@@_break_point:TF} marker. % As for a normal character, we check for quantifiers. % \begin{macrocode} \cs_set_protected:Npn \@@_tmp:w #1#2 { \cs_new_protected_nopar:cpx { @@_compile_/#1: } { \@@_compile_one:x \exp_not:c { @@_prop_#1: } } \cs_new_protected_nopar:cpx { @@_compile_/#2: } { \@@_compile_one:x { \@@_item_reverse:n \exp_not:c { @@_prop_#1: } } } } \@@_tmp:w d D \@@_tmp:w h H \@@_tmp:w s S \@@_tmp:w v V \@@_tmp:w w W \cs_new_protected_nopar:cpn { @@_compile_/N: } { \@@_compile_one:x \@@_prop_N: } % \end{macrocode} % \end{macro} % % \subsubsection{Anchoring and simple assertions} % % \begin{macro}[aux]{\@@_compile_anchor:NF} % \begin{macro}[aux]+\@@_compile_^:+ % \begin{macro}[aux]{\@@_compile_/A:, \@@_compile_/G:} % \begin{macro}[aux]+\@@_compile_$:+ % \begin{macro}[aux]{\@@_compile_/Z:, \@@_compile_/z:} % In modes where assertions are allowed, anchor to the start of the % query, the start of the match, or the end of the query, depending on % the integer |#1|. In other modes, |#2| treats the character as raw, % with an error for escaped letters (|$| is valid in a class, but |\A| % is definitely a mistake on the user's part). % \begin{macrocode} \cs_new_protected:Npn \@@_compile_anchor:NF #1#2 { \@@_if_in_class_or_catcode:TF {#2} { \__tl_build_one:n { \@@_assertion:Nn \c_true_bool { \@@_anchor:N #1 } } } } \cs_set_protected:Npn \@@_tmp:w #1#2 { \cs_new_protected_nopar:cpn { @@_compile_/#1: } { \@@_compile_anchor:NF #2 { \@@_compile_raw_error:N #1 } } } \@@_tmp:w A \l_@@_min_pos_int \@@_tmp:w G \l_@@_start_pos_int \@@_tmp:w Z \l_@@_max_pos_int \@@_tmp:w z \l_@@_max_pos_int \cs_set_protected:Npn \@@_tmp:w #1#2 { \cs_new_protected_nopar:cpn { @@_compile_#1: } { \@@_compile_anchor:NF #2 { \@@_compile_raw:N #1 } } } \exp_args:Nx \@@_tmp:w { \iow_char:N \^ } \l_@@_min_pos_int \exp_args:Nx \@@_tmp:w { \iow_char:N \$ } \l_@@_max_pos_int % \end{macrocode} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % % \begin{macro}[aux]{\@@_compile_/b:, \@@_compile_/B:} % Contrarily to |^| and |$|, which could be implemented without really % knowing what precedes in the token list, this requires more % information, namely, the knowledge of the last character code. % \begin{macrocode} \cs_new_protected_nopar:cpn { @@_compile_/b: } { \@@_if_in_class_or_catcode:TF { \@@_compile_raw_error:N b } { \__tl_build_one:n { \@@_assertion:Nn \c_true_bool { \@@_b_test: } } } } \cs_new_protected_nopar:cpn { @@_compile_/B: } { \@@_if_in_class_or_catcode:TF { \@@_compile_raw_error:N B } { \__tl_build_one:n { \@@_assertion:Nn \c_false_bool { \@@_b_test: } } } } % \end{macrocode} % \end{macro} % % \subsubsection{Character classes} % % \begin{macro}[aux]{\@@_compile_]:} % Outside a class, right brackets have no meaning. In a class, change % the mode ($m\to (m-15)/13$, truncated) to reflect the fact that we % are leaving the class. Look for quantifiers, unless we are still in % a class after leaving one (the case of |[...\cL[...]...]|). % quantifiers. % \begin{macrocode} \cs_new_protected:cpn { @@_compile_]: } { \@@_if_in_class:TF { \if_int_compare:w \l_@@_mode_int > \c_sixteen \__tl_build_one:n { \if_false: { \fi: } } \fi: \tex_advance:D \l_@@_mode_int - \c_fifteen \tex_divide:D \l_@@_mode_int \c_thirteen \if_int_odd:w \l_@@_mode_int \else: \exp_after:wN \@@_compile_quantifier:w \fi: } { \@@_compile_raw:N ] } } % \end{macrocode} % \end{macro} % % \begin{macro}[aux]{\@@_compile_[:} % In a class, left brackets might introduce a \textsc{posix} character % class, or mean nothing. Immediately following |\c|\meta{category}, % we must insert the appropriate catcode test, then parse the class; we % pre-expand the catcode as an optimization. Otherwise (modes $0$, % $-2$ and $-6$) just parse the class. The mode is updated later. % \begin{macrocode} \cs_new_protected_nopar:cpn { @@_compile_[: } { \@@_if_in_class:TF { \@@_compile_class_posix_test:w } { \@@_if_within_catcode:TF { \exp_after:wN \@@_compile_class_catcode:w \int_use:N \l_@@_catcodes_int ; } { \@@_compile_class_normal:w } } } % \end{macrocode} % \end{macro} % % \begin{macro}[aux]{\@@_compile_class_normal:w} % In the \enquote{normal} case, we will insert \cs{@@_class:NnnnN} % \meta{boolean} in the compiled code. The \meta{boolean} is true for % positive classes, and false for negative classes, characterized by a % leading |^|. The auxiliary \cs{@@_compile_class:TFNN} also % checks for a leading |]| which has a special meaning. % \begin{macrocode} \cs_new_protected_nopar:Npn \@@_compile_class_normal:w { \@@_compile_class:TFNN { \@@_class:NnnnN \c_true_bool } { \@@_class:NnnnN \c_false_bool } } % \end{macrocode} % \end{macro} % % \begin{macro}[aux]{\@@_compile_class_catcode:w} % This function is called for a left bracket in modes $2$ or $6$ % (catcode test, and catcode test within a class). In mode $2$ the % whole construction needs to be put in a class (like single % character). Then determine if the class is positive or negative, % inserting \cs{@@_item_catcode:nT} or the \texttt{reverse} variant % as appropriate, each with the current catcodes bitmap |#1| as an % argument, and reset the catcodes. % \begin{macrocode} \cs_new_protected:Npn \@@_compile_class_catcode:w #1; { \if_int_compare:w \l_@@_mode_int = \c_two \__tl_build_one:n { \@@_class:NnnnN \c_true_bool { \if_false: } \fi: } \fi: \int_set_eq:NN \l_@@_catcodes_int \l_@@_default_catcodes_int \@@_compile_class:TFNN { \@@_item_catcode:nT {#1} } { \@@_item_catcode_reverse:nT {#1} } } % \end{macrocode} % \end{macro} % % \begin{macro}[aux] % {\@@_compile_class:TFNN, \@@_compile_class:NN} % If the first character is |^|, then the class is negative (use % |#2|), otherwise it is positive (use |#1|). If the next character % is a right bracket, then it should be changed to a raw one. % \begin{macrocode} \cs_new_protected:Npn \@@_compile_class:TFNN #1#2#3#4 { \l_@@_mode_int = \__int_value:w \l_@@_mode_int 3 \exp_stop_f: \str_if_eq:nnTF { #3 #4 } { \@@_compile_special:N ^ } { \__tl_build_one:n { #2 { \if_false: } \fi: } \@@_compile_class:NN } { \__tl_build_one:n { #1 { \if_false: } \fi: } \@@_compile_class:NN #3 #4 } } \cs_new_protected:Npn \@@_compile_class:NN #1#2 { \token_if_eq_charcode:NNTF #2 ] { \@@_compile_raw:N #2 } { #1 #2 } } % \end{macrocode} % \end{macro} % % \begin{macro}[aux] % { % \@@_compile_class_posix_test:w, % \@@_compile_class_posix:NNNNw, % \@@_compile_class_posix_loop:w, % \@@_compile_class_posix_end:w % } % Here we check for a syntax such as |[:alpha:]|. We also detect |[=| % and |[.| which have a meaning in \textsc{posix} regular expressions, % but are not implemented in \pkg{l3regex}. In case we see |[:|, grab % raw characters until hopefully reaching |:]|. If that's missing, or % the \textsc{posix} class is unknown, abort. If all is right, add the % test to the current class, with an extra \cs{@@_item_reverse:n} % for negative classes. % \begin{macrocode} \cs_new_protected:Npn \@@_compile_class_posix_test:w #1#2 { \token_if_eq_meaning:NNT \@@_compile_special:N #1 { \str_case:nn { #2 } { : { \@@_compile_class_posix:NNNNw } = { \__msg_kernel_warning:nnx { regex } { posix-unsupported } { = } } . { \__msg_kernel_warning:nnx { regex } { posix-unsupported } { . } } } } \@@_compile_raw:N [ #1 #2 } \cs_new_protected:Npn \@@_compile_class_posix:NNNNw #1#2#3#4#5#6 { \str_if_eq:nnTF { #5 #6 } { \@@_compile_special:N ^ } { \bool_set_false:N \l_@@_internal_bool \tl_set:Nx \l_@@_internal_a_tl { \if_false: } \fi: \@@_compile_class_posix_loop:w } { \bool_set_true:N \l_@@_internal_bool \tl_set:Nx \l_@@_internal_a_tl { \if_false: } \fi: \@@_compile_class_posix_loop:w #5 #6 } } \cs_new:Npn \@@_compile_class_posix_loop:w #1#2 { \token_if_eq_meaning:NNTF \@@_compile_raw:N #1 { #2 \@@_compile_class_posix_loop:w } { \if_false: { \fi: } \@@_compile_class_posix_end:w #1 #2 } } \cs_new_protected:Npn \@@_compile_class_posix_end:w #1#2#3#4 { \str_if_eq:nnTF { #1 #2 #3 #4 } { \@@_compile_special:N : \@@_compile_special:N ] } { \cs_if_exist:cTF { @@_posix_ \l_@@_internal_a_tl : } { \@@_compile_one:x { \bool_if:NF \l_@@_internal_bool \@@_item_reverse:n \exp_not:c { @@_posix_ \l_@@_internal_a_tl : } } } { \__msg_kernel_warning:nnx { regex } { posix-unknown } { \l_@@_internal_a_tl } \@@_compile_abort_tokens:x { [: \bool_if:NF \l_@@_internal_bool { ^ } \l_@@_internal_a_tl :] } } } { \__msg_kernel_error:nnxx { regex } { posix-missing-close } { [: \l_@@_internal_a_tl } { #2 #4 } \@@_compile_abort_tokens:x { [: \l_@@_internal_a_tl } #1 #2 #3 #4 } } % \end{macrocode} % \end{macro} % % \subsubsection{Groups and alternations} % % \begin{macro}[aux]{\@@_compile_group_begin:N, \@@_compile_group_end:} % The contents of a regex group are turned into compiled code in % \cs{l_@@_internal_regex}, which ends up with items of the form % \cs{@@_branch:n} \Arg{concatenation}. This construction is done % using \pkg{l3tl-build} within a \TeX{} group, which automatically % makes sure that options (case-sensitivity and default catcode) are % reset at the end of the group. The argument |#1| is % \cs{@@_group:nnnN} or a variant thereof. A small subtlety to % support |\cL(abc)| as a shorthand for |(\cLa\cLb\cLc)|: exit any % pending catcode test, save the category code at the start of the % group as the default catcode for that group, and make sure that the % catcode is restored to the default outside the group. % \begin{macrocode} \cs_new_protected:Npn \@@_compile_group_begin:N #1 { \__tl_build_one:n { #1 { \if_false: } \fi: } \@@_mode_quit_c: \__tl_build:Nw \l_@@_internal_regex \int_set_eq:NN \l_@@_default_catcodes_int \l_@@_catcodes_int \int_incr:N \l_@@_group_level_int \__tl_build_one:n { \@@_branch:n { \if_false: } \fi: } } \cs_new_protected:Npn \@@_compile_group_end: { \if_int_compare:w \l_@@_group_level_int > \c_zero \__tl_build_one:n { \if_false: { \fi: } } \__tl_build_end: \int_set_eq:NN \l_@@_catcodes_int \l_@@_default_catcodes_int \__tl_build_one:o \l_@@_internal_regex \exp_after:wN \@@_compile_quantifier:w \else: \__msg_kernel_warning:nn { regex } { extra-rparen } \exp_after:wN \@@_compile_raw:N \exp_after:wN ) \fi: } % \end{macrocode} % \end{macro} % % \begin{macro}[aux]{\@@_compile_(:} % In a class, parentheses are not special. Outside, check for a |?|, % denoting special groups, and run the code for the corresponding % special group. % \begin{macrocode} \cs_new_protected_nopar:cpn { @@_compile_(: } { \@@_if_in_class:TF { \@@_compile_raw:N ( } { \@@_compile_lparen:w } } \cs_new_protected:Npn \@@_compile_lparen:w #1#2#3#4 { \str_if_eq:nnTF { #1 #2 } { \@@_compile_special:N ? } { \cs_if_exist_use:cF { @@_compile_special_group_\token_to_str:N #4 :w } { \__msg_kernel_warning:nnx { regex } { special-group-unknown } { (? #4 } \@@_compile_group_begin:N \@@_group:nnnN \@@_compile_raw:N ? #3 #4 } } { \@@_compile_group_begin:N \@@_group:nnnN #1 #2 #3 #4 } } % \end{macrocode} % \end{macro} % % \begin{macro}[aux]+\@@_compile_|:+ % In a class, the pipe is not special. Otherwise, end the current % branch and open another one. % \begin{macrocode} \cs_new_protected_nopar:cpn { @@_compile_|: } { \@@_if_in_class:TF { \@@_compile_raw:N | } { \__tl_build_one:n { \if_false: { \fi: } \@@_branch:n { \if_false: } \fi: } } } % \end{macrocode} % \end{macro} % % \begin{macro}[aux]{\@@_compile_):} % Within a class, parentheses are not special. Outside, close a group. % \begin{macrocode} \cs_new_protected_nopar:cpn { @@_compile_): } { \@@_if_in_class:TF { \@@_compile_raw:N ) } { \@@_compile_group_end: } } % \end{macrocode} % \end{macro} % % \begin{macro}[aux]{\@@_compile_special_group_::w} % \begin{macro}[aux]+\@@_compile_special_group_|:w+ % Non-capturing, and resetting groups are easy to take care of during % compilation; for those groups, the harder parts will come when % building. % \begin{macrocode} \cs_new_protected_nopar:cpn { @@_compile_special_group_::w } { \@@_compile_group_begin:N \@@_group_no_capture:nnnN } \cs_new_protected_nopar:cpn { @@_compile_special_group_|:w } { \@@_compile_group_begin:N \@@_group_resetting:nnnN } % \end{macrocode} % \end{macro} % \end{macro} % % \begin{macro}[aux] % {\@@_compile_special_group_i:w, \@@_compile_special_group_-:w} % The match can be made case-insensitive by setting the option with % \texttt{(?i)}; the original behaviour is restored by \texttt{(?-i)}. % This is the only supported option. % \begin{macrocode} \cs_new_protected:Npn \@@_compile_special_group_i:w #1#2 { \str_if_eq:nnTF { #1 #2 } { \@@_compile_special:N ) } { \cs_set_nopar:Npn \@@_item_equal:n { \@@_item_caseless_equal:n } \cs_set_nopar:Npn \@@_item_range:nn { \@@_item_caseless_range:nn } } { \__msg_kernel_warning:nnx { regex } { unknown-option } { (?i #2 } \@@_compile_raw:N ( \@@_compile_raw:N ? \@@_compile_raw:N i #1 #2 } } \cs_new_protected_nopar:cpn { @@_compile_special_group_-:w } #1#2#3#4 { \str_if_eq:nnTF { #1 #2 #3 #4 } { \@@_compile_raw:N i \@@_compile_special:N ) } { \cs_set_nopar:Npn \@@_item_equal:n { \@@_item_caseful_equal:n } \cs_set_nopar:Npn \@@_item_range:nn { \@@_item_caseful_range:nn } } { \__msg_kernel_warning:nnx { regex } { unknown-option } { (?-#2#4 } \@@_compile_raw:N ( \@@_compile_raw:N ? \@@_compile_raw:N - #1 #2 #3 #4 } } % \end{macrocode} % \end{macro} % % \subsubsection{Catcodes and csnames} % % \begin{macro}[aux]{\@@_compile_/c:, \@@_compile_c_test:NN} % The |\c| escape sequence can be followed by a capital letter % representing a character category, by a left bracket which starts a % list of categories, or by a brace group holding a regular expression % for a control sequence name. Otherwise, raise an error. % \begin{macrocode} \cs_new_protected:cpn { @@_compile_/c: } { \@@_chk_c_allowed:T { \@@_compile_c_test:NN } } \cs_new_protected:Npn \@@_compile_c_test:NN #1#2 { \token_if_eq_meaning:NNTF #1 \@@_compile_raw:N { \int_if_exist:cTF { c_@@_catcode_#2_int } { \int_set_eq:Nc \l_@@_catcodes_int { c_@@_catcode_#2_int } \l_@@_mode_int = \if_case:w \l_@@_mode_int \c_two \else: \c_six \fi: } } { \cs_if_exist_use:cF { @@_compile_c_#2:w } } { \__msg_kernel_error:nnx { regex } { c-missing-category } {#2} #1 #2 } } % \end{macrocode} % \end{macro} % % \begin{macro}[aux] % { % \@@_compile_c_[:w, % \@@_compile_c_lbrack_loop:NN, % \@@_compile_c_lbrack_add:N, % \@@_compile_c_lbrack_end:, % } % When encountering |\c[|, the task is to collect uppercase letters % representing character categories. First check for |^| which negates % the list of category codes. % \begin{macrocode} \cs_new_protected:cpn { @@_compile_c_[:w } #1#2 { \l_@@_mode_int = \if_case:w \l_@@_mode_int \c_two \else: \c_six \fi: \int_zero:N \l_@@_catcodes_int \str_if_eq:nnTF { #1 #2 } { \@@_compile_special:N ^ } { \bool_set_false:N \l_@@_catcodes_bool \@@_compile_c_lbrack_loop:NN } { \bool_set_true:N \l_@@_catcodes_bool \@@_compile_c_lbrack_loop:NN #1 #2 } } \cs_new_protected:Npn \@@_compile_c_lbrack_loop:NN #1#2 { \token_if_eq_meaning:NNTF #1 \@@_compile_raw:N { \int_if_exist:cTF { c_@@_catcode_#2_int } { \exp_args:Nc \@@_compile_c_lbrack_add:N { c_@@_catcode_#2_int } \@@_compile_c_lbrack_loop:NN } } { \token_if_eq_charcode:NNTF #2 ] { \@@_compile_c_lbrack_end: } } { \__msg_kernel_error:nnx { regex } { c-missing-rbrack } {#2} \@@_compile_c_lbrack_end: #1 #2 } } \cs_new_protected:Npn \@@_compile_c_lbrack_add:N #1 { \if_int_odd:w \__int_eval:w \l_@@_catcodes_int / #1 \__int_eval_end: \else: \tex_advance:D \l_@@_catcodes_int #1 \fi: } \cs_new_protected_nopar:Npn \@@_compile_c_lbrack_end: { \if_meaning:w \c_false_bool \l_@@_catcodes_bool \int_set:Nn \l_@@_catcodes_int { \c_@@_all_catcodes_int - \l_@@_catcodes_int } \fi: } % \end{macrocode} % \end{macro} % % \begin{macro}+\@@_compile_c_{:+ % The case of a left brace is easy, based on what we have done so far: % in a group, compile the regular expression, after changing the mode % to forbid nesting |\c|. Additionally, disable submatch tracking % since groups don't escape the scope of |\c{...}|. % \begin{macrocode} \cs_new_protected_nopar:cpn { @@_compile_c_ \c_left_brace_str :w } { \@@_compile:w \@@_disable_submatches: \l_@@_mode_int = - \if_case:w \l_@@_mode_int \c_two \else: \c_six \fi: } % \end{macrocode} % \end{macro} % % \begin{macro}+\@@_compile_}:+ % Non-escaped right braces are only special if they appear when % compiling the regular expression for a csname, but not within a % class: |\c{[}{]}| matches the control sequences |\}| and % |\{|\ldots{} Admittedly, that would be better done as % |\c{[{}]}|. So, end compiling the inner regex (this closes any % dangling class or group). Then insert the corresponding test in the % outer regex. % \begin{macrocode} \cs_new_protected:cpn { @@_compile_ \c_right_brace_str : } { \@@_if_in_cs:TF { \@@_compile_end: \@@_compile_one:x { \@@_item_cs:n { \exp_not:o \l_@@_internal_regex } } } { \exp_after:wN \@@_compile_raw:N \c_right_brace_str } } % \end{macrocode} % \end{macro} % % \subsubsection{Raw token lists with \cs{u}} % % \begin{macro}[aux]{\@@_compile_/u:} % \begin{macro}[aux, EXP]{\@@_compile_u_loop:NN} % The |\u| escape is invalid in classes and directly following a % catcode test. Otherwise, it must be followed by a left brace. We % then collect the characters for the argument of |\u| within an % \texttt{x}-expanding assignment. In principle we could just wait to % encounter a right brace, but this is unsafe: if the right brace is % missing, then we will reach the end-markers of the regex, and % continue, leading to obscure fatal errors. Instead, we only allow % raw and special characters, and stop when encountering a special % right brace, any escaped character, or the end-marker. % \begin{macrocode} \cs_new_protected:cpn { @@_compile_/u: } #1#2 { \@@_if_in_class_or_catcode:TF { \@@_compile_raw_error:N u #1 #2 } { \str_if_eq_x:nnTF {#1#2} { \@@_compile_special:N \c_left_brace_str } { \tl_set:Nx \l_@@_internal_a_tl { \if_false: } \fi: \@@_compile_u_loop:NN } { \__msg_kernel_error:nn { regex } { u-missing-lbrace } \@@_compile_raw:N u #1 #2 } } } \cs_new:Npn \@@_compile_u_loop:NN #1#2 { \token_if_eq_meaning:NNTF #1 \@@_compile_raw:N { #2 \@@_compile_u_loop:NN } { \token_if_eq_meaning:NNTF #1 \@@_compile_special:N { \exp_after:wN \token_if_eq_charcode:NNTF \c_right_brace_str #2 { \if_false: { \fi: } \@@_compile_u_end: } { #2 \@@_compile_u_loop:NN } } { \if_false: { \fi: } \__msg_kernel_error:nnx { regex } { u-missing-rbrace } {#2} \@@_compile_u_end: #1 #2 } } } % \end{macrocode} % \end{macro} % \end{macro} % % \begin{macro}[aux]{\@@_compile_u_end:} % Once we have extracted the variable's name, we store the contents of % that variable in \cs{l_@@_internal_a_tl}. The behaviour of |\u| % then depends on whether we are within a |\c{...}| escape (in this % case, the variable is turned to a string), or not. % \begin{macrocode} \cs_new_protected:Npn \@@_compile_u_end: { \tl_set:Nv \l_@@_internal_a_tl { \l_@@_internal_a_tl } \if_int_compare:w \l_@@_mode_int = \c_zero \@@_compile_u_not_cs: \else: \@@_compile_u_in_cs: \fi: } % \end{macrocode} % \end{macro} % % \begin{macro}[aux]{\@@_compile_u_in_cs:} % When |\u| appears within a control sequence, we convert the variable % to a string with escaped spaces. Then for each character insert a % class matching exactly that character, once. % \begin{macrocode} \cs_new_protected:Npn \@@_compile_u_in_cs: { \exp_args:NNo \__str_gset_other:Nn \g_@@_internal_tl { \l_@@_internal_a_tl } \__tl_build_one:x { \tl_map_function:NN \g_@@_internal_tl \@@_compile_u_in_cs_aux:n } } \cs_new:Npn \@@_compile_u_in_cs_aux:n #1 { \@@_class:NnnnN \c_true_bool { \@@_item_caseful_equal:n { \__int_value:w `#1 } } { 1 } { 0 } \c_false_bool } % \end{macrocode} % \end{macro} % % \begin{macro}[aux]{\@@_compile_u_not_cs:} % In mode $0$, the |\u| escape adds one state to the NFA for each % token in \cs{l_@@_internal_a_tl}. If a given \meta{token} is a % control sequence, then insert a string comparison test, otherwise, % \cs{@@_item_exact:nn} which compares catcode and character code. % \begin{macrocode} \cs_new_protected:Npn \@@_compile_u_not_cs: { \exp_args:No \__tl_analysis_map_inline:nn { \l_@@_internal_a_tl } { \__tl_build_one:n { \@@_class:NnnnN \c_true_bool { \if_int_compare:w "##2 = \c_zero \@@_item_exact_cs:c { \exp_after:wN \cs_to_str:N ##1 } \else: \@@_item_exact:nn { \__int_value:w "##2 } { ##3 } \fi: } { 1 } { 0 } \c_false_bool } } } % \end{macrocode} % \end{macro} % % \subsubsection{Other} % % \begin{macro}[aux]{\@@_compile_/K:} % The |\K| control sequence is currently the only \enquote{command}, % which performs some action, rather than matching something. It is % allowed in the same contexts as |\b|. At the compilation stage, we % leave it as a single control sequence, defined later. % \begin{macrocode} \cs_new_protected_nopar:cpn { @@_compile_/K: } { \int_compare:nNnTF \l_@@_mode_int = \c_zero { \__tl_build_one:n { \@@_command_K: } } { \@@_compile_raw_error:N K } } % \end{macrocode} % \end{macro} % % \subsubsection{Showing regexes} % % \begin{macro}[aux]{\@@_show:Nx} % Within a \cs{__tl_build:Nw} \ldots{} \cs{__tl_build_end:} group, we % redefine all the function that can appear in a compiled regex, then % run the regex. The result is then shown. % \begin{macrocode} \cs_new_protected:Npn \@@_show:Nx #1#2 { \__tl_build:Nw \l_@@_internal_a_tl \cs_set_protected_nopar:Npn \@@_branch:n { \seq_pop_right:NN \l_@@_show_prefix_seq \l_@@_internal_a_tl \@@_show_one:n { +-branch } \seq_put_right:No \l_@@_show_prefix_seq \l_@@_internal_a_tl \use:n } \cs_set_protected_nopar:Npn \@@_group:nnnN { \@@_show_group_aux:nnnnN { } } \cs_set_protected_nopar:Npn \@@_group_no_capture:nnnN { \@@_show_group_aux:nnnnN { ~(no~capture) } } \cs_set_protected_nopar:Npn \@@_group_resetting:nnnN { \@@_show_group_aux:nnnnN { ~(resetting) } } \cs_set_eq:NN \@@_class:NnnnN \@@_show_class:NnnnN \cs_set_protected_nopar:Npn \@@_command_K: { \@@_show_one:n { reset~match~start~(\iow_char:N\\K) } } \cs_set_protected:Npn \@@_assertion:Nn ##1##2 { \@@_show_one:n { \bool_if:NF ##1 { negative~ } assertion:~##2 } } \cs_set_nopar:Npn \@@_b_test: { word~boundary } \cs_set_eq:NN \@@_anchor:N \@@_show_anchor_to_str:N \cs_set_protected:Npn \@@_item_caseful_equal:n ##1 { \@@_show_one:n { char~code~\int_eval:n{##1} } } \cs_set_protected:Npn \@@_item_caseful_range:nn ##1##2 { \@@_show_one:n { range~[\int_eval:n{##1}, \int_eval:n{##2}] } } \cs_set_protected:Npn \@@_item_caseless_equal:n ##1 { \@@_show_one:n { char~code~\int_eval:n{##1}~(caseless) } } \cs_set_protected:Npn \@@_item_caseless_range:nn ##1##2 { \@@_show_one:n { Range~[\int_eval:n{##1}, \int_eval:n{##2}]~(caseless) } } \cs_set_protected:Npn \@@_item_catcode:nT { \@@_show_item_catcode:NnT \c_true_bool } \cs_set_protected:Npn \@@_item_catcode_reverse:nT { \@@_show_item_catcode:NnT \c_false_bool } \cs_set_protected:Npn \@@_item_reverse:n { \@@_show_scope:nn { Reversed~match } } \cs_set_protected:Npn \@@_item_exact:nn ##1##2 { \@@_show_one:n { char~##2,~catcode~##1 } } \cs_set_protected:Npn \@@_item_exact_cs:c ##1 { \@@_show_one:n { control~sequence~\iow_char:N\\##1 } } \cs_set_protected:Npn \@@_item_cs:n { \@@_show_scope:nn { control~sequence } } \cs_set:cpn { @@_prop_.: } { \@@_show_one:n { any~token } } \seq_clear:N \l_@@_show_prefix_seq \@@_show_push:n { ~ } #1 \__tl_build_end: \__msg_show_variable:n { >~Compiled~regex~#2: \l_@@_internal_a_tl } } % \end{macrocode} % \end{macro} % % \begin{macro}[aux]{\@@_show_one:n} % Every part of the final message go through this function, which adds % one line to the output, with the appropriate prefix. % \begin{macrocode} \cs_new_protected:Npn \@@_show_one:n #1 { \int_incr:N \l_@@_show_lines_int \__tl_build_one:x { \exp_not:N \\ \seq_map_function:NN \l_@@_show_prefix_seq \use:n #1 } } % \end{macrocode} % \end{macro} % % \begin{macro}[aux] % {\@@_show_push:n, \@@_show_pop:, \@@_show_scope:nn} % Enter and exit levels of nesting. The \texttt{scope} function prints % its first argument as an \enquote{introduction}, then performs its % second argument in a deeper level of nesting. % \begin{macrocode} \cs_new_protected:Npn \@@_show_push:n #1 { \seq_put_right:Nx \l_@@_show_prefix_seq { #1 ~ } } \cs_new_protected:Npn \@@_show_pop: { \seq_pop_right:NN \l_@@_show_prefix_seq \l_@@_internal_a_tl } \cs_new_protected:Npn \@@_show_scope:nn #1#2 { \@@_show_one:n {#1} \@@_show_push:n { ~ } #2 \@@_show_pop: } % \end{macrocode} % \end{macro} % % \begin{macro}[aux]{\@@_show_group_aux:nnnnN} % We display all groups in the same way, simply adding a message, % \texttt{(no capture)} or \texttt{(resetting)}, to special groups. % The odd \cs{use_ii:nn} avoids printing a spurious \texttt{+-branch} % for the first branch. % \begin{macrocode} \cs_new_protected:Npn \@@_show_group_aux:nnnnN #1#2#3#4#5 { \@@_show_one:n { ,-group~begin #1 } \@@_show_push:n { | } \use_ii:nn #2 \@@_show_pop: \@@_show_one:n { `-group~end \@@_msg_repeated:nnN {#3} {#4} #5 } } % \end{macrocode} % \end{macro} % % \begin{macro}[aux]{\@@_show_class:NnnnN} % I'm entirely unhappy about this function: I couldn't find a way to % test if a class is a single test. Instead, collect the % representation of the tests in the class. If that had more than one % line, write \texttt{Match} or \texttt{Don't match} on its own line, % with the repeating information if any. Then the various tests on % lines of their own, and finally a line. Otherwise, we need to % evaluate the representation of the tests again (since the prefix is % incorrect). That's clunky, but not too expensive, since it's only % one test. % \begin{macrocode} \cs_set:Npn \@@_show_class:NnnnN #1#2#3#4#5 { \__tl_build:Nw \l_@@_internal_a_tl \int_zero:N \l_@@_show_lines_int \@@_show_push:n {~} #2 \exp_last_unbraced:Nf \int_case:nnF { \l_@@_show_lines_int } { {0} { \__tl_build_end: \@@_show_one:n { \bool_if:NTF #1 { Fail } { Pass } } } {1} { \__tl_build_end: \bool_if:NTF #1 { #2 \__tl_build_one:n { \@@_msg_repeated:nnN {#3} {#4} #5 } } { \@@_show_one:n { Don't~match~\@@_msg_repeated:nnN {#3} {#4} #5 } \__tl_build_one:o \l_@@_internal_a_tl } } } { \__tl_build_end: \@@_show_one:n { \bool_if:NTF #1 { M } { Don't~m } atch \@@_msg_repeated:nnN {#3} {#4} #5 } \__tl_build_one:o \l_@@_internal_a_tl } } % \end{macrocode} % \end{macro} % % \begin{macro}[aux, rEXP]{\@@_show_anchor_to_str:N} % The argument is an integer telling us where the anchor is. We % convert that to the relevant info. % \begin{macrocode} \cs_new:Npn \@@_show_anchor_to_str:N #1 { anchor~at~ \str_case:nnF { #1 } { { \l_@@_min_pos_int } { start~(\iow_char:N\\A) } { \l_@@_start_pos_int } { start~of~match~(\iow_char:N\\G) } { \l_@@_max_pos_int } { end~(\iow_char:N\\Z) } } { } } % \end{macrocode} % \end{macro} % % \begin{macro}[aux]{\@@_show_item_catcode:NnT} % Produce a sequence of categories which the catcode bitmap |#2| % contains, and show it, indenting the tests on which this catcode % constraint applies. % \begin{macrocode} \cs_new_protected:Npn \@@_show_item_catcode:NnT #1#2 { \seq_set_split:Nnn \l_@@_internal_seq { } { CBEMTPUDSLOA } \seq_set_filter:NNn \l_@@_internal_seq \l_@@_internal_seq { \int_if_odd_p:n { #2 / \int_use:c { c_@@_catcode_##1_int } } } \@@_show_scope:nn { categories~ \seq_map_function:NN \l_@@_internal_seq \use:n , ~ \bool_if:NF #1 { negative~ } class } } % \end{macrocode} % \end{macro} % % \subsection{Building} % % \subsubsection{Variables used while building} % % \begin{variable}{\l_@@_min_state_int, \l_@@_max_state_int} % The last state that was allocated is $\cs{l_@@_max_state_int}-1$, % so that \cs{l_@@_max_state_int} always points to a free state. % The \texttt{min_state} variable is always $0$, but is included to % avoid hard-coding this value. % \begin{macrocode} \int_new:N \l_@@_min_state_int \int_new:N \l_@@_max_state_int % \end{macrocode} % \end{variable} % % \begin{variable}{\l_@@_left_state_int, \l_@@_right_state_int} % \begin{variable}{\l_@@_left_state_seq, \l_@@_right_state_seq} % Alternatives are implemented by branching from a \texttt{left} state % into the various choices, then merging those into a \texttt{right} % state. We store information about those states in two sequences. % Those states are also used to implement group quantifiers. Most % often, the left and right pointers only differ by~$1$. % \begin{macrocode} \int_new:N \l_@@_left_state_int \int_new:N \l_@@_right_state_int \seq_new:N \l_@@_left_state_seq \seq_new:N \l_@@_right_state_seq % \end{macrocode} % \end{variable} % \end{variable} % % \begin{variable}{\l_@@_capturing_group_int} % \cs{l_@@_capturing_group_int} is the \textsc{id} number that will % be assigned to a capturing group if one was opened now. This starts % at $0$ for the group enclosing the full regular expression, and % groups are counted in the order of their left parenthesis, except % when encountering \texttt{resetting} groups. % \begin{macrocode} \int_new:N \l_@@_capturing_group_int % \end{macrocode} % \end{variable} % % \subsubsection{Framework} % % This phase is about going from a compiled regex to an \textsc{nfa}. % Each state of the \textsc{nfa} is stored in a \tn{toks}. The % operations which can appear in the \tn{toks} are % \begin{itemize} % \item \cs{@@_action_start_wildcard:} inserted at the start % of the regular expression to make it unanchored. % \item \cs{@@_action_success:} marks the exit state of the % \textsc{nfa}. % \item \cs{@@_action_cost:n} \Arg{shift} is a transition from the % current \meta{state} to $\meta{state}+\meta{shift}$, which % consumes the current character: the target state is saved and will % be considered again when matching at the next position. % \item \cs{@@_action_free:n} \Arg{shift}, and % \cs{@@_action_free_group:n} \Arg{shift} are free transitions, % which immediately perform the actions for the state % $\meta{state}+\meta{shift}$ of the \textsc{nfa}. They differ in % how they detect and avoid infinite loops. For now, we just need to % know that the \texttt{group} variant must be used for transitions % back to the start of a group. % \item \cs{@@_action_submatch:n} \Arg{key} where the \meta{key} is % a group number followed by |<| or |>| for the beginning or end of % group. This causes the current position in the query to be stored % as the \meta{key} submatch boundary. % \end{itemize} % % We strive to preserve the following properties while building. % \begin{itemize} % \item The current capturing group is % $\text{\texttt{capturing_group}}-1$, and if a group is opened now, % it will be labelled \texttt{capturing_group}. % \item The last allocated state is $\text{\texttt{max_state}}-1$, so % \texttt{max_state} is a free state. % \item The \texttt{left_state} points to a state to the left of the % current group or of the last class. % \item The \texttt{right_state} points to a newly created, % empty state, with some transitions leading to it. % \item The \texttt{left/right} sequences hold a list of the % corresponding end-points of nested groups. % \end{itemize} % % \begin{macro}[int]{\@@_build:n, \@@_build:N} % The \texttt{n}-type function first compiles its argument. Reset some % variables. Allocate two states, and put a wildcard in state $0$ % (transitions to state $1$ and $0$ state). Then build the regex % within a (capturing) group, which will be numbered $0$ (current % value of \texttt{capturing_group}). Finally, if the match reaches the % last state, it is successful. % \begin{macrocode} \cs_new_protected:Npn \@@_build:n #1 { \@@_compile:n {#1} \@@_build:N \l_@@_internal_regex } \cs_new_protected:Npn \@@_build:N #1 { % \trace_push:nnn { regex } { 1 } { @@_build } \int_set:Nn \tex_escapechar:D { 92 } \int_zero:N \l_@@_capturing_group_int \int_set_eq:NN \l_@@_max_state_int \l_@@_min_state_int \@@_build_new_state: \@@_build_new_state: \@@_toks_put_right:Nn \l_@@_left_state_int { \@@_action_start_wildcard: } \@@_group:nnnN {#1} { 1 } { 0 } \c_false_bool \@@_toks_put_right:Nn \l_@@_right_state_int { \@@_action_success: } % \@@_trace_states:n { 2 } % \trace_pop:nnn { regex } { 1 } { @@_build } } % \end{macrocode} % \end{macro} % % \begin{macro}[int]{\@@_build_for_cs:n} % When using a regex to match a cs, we don't insert a wildcard, we % anchor at the end, and since we ignore submatches, there is no need % to surround the expression with a group. However, for branches to % work properly at the outer level, we need to put the appropriate % \texttt{left} and \texttt{right} states in their sequence. % \begin{macrocode} \cs_new_protected:Npn \@@_build_for_cs:n #1 { % \trace_push:nnn { regex } { 1 } { @@_build_for_cs } \int_set_eq:NN \l_@@_max_state_int \l_@@_min_state_int \@@_build_new_state: \@@_build_new_state: \@@_push_lr_states: #1 \@@_pop_lr_states: \@@_toks_put_right:Nn \l_@@_right_state_int { \if_int_compare:w \l_@@_current_pos_int = \l_@@_max_pos_int \exp_after:wN \@@_action_success: \fi: } % \@@_trace_states:n { 2 } % \trace_pop:nnn { regex } { 1 } { @@_build_for_cs } } % \end{macrocode} % \end{macro} % % \subsubsection{Helpers for building an \textsc{nfa}} % % \begin{macro}[int]{\@@_push_lr_states:, \@@_pop_lr_states:} % When building the regular expression, we keep track of pointers to % the left-end and right-end of each group without help from \TeX{}'s % grouping. % \begin{macrocode} \cs_new_protected_nopar:Npn \@@_push_lr_states: { \seq_push:No \l_@@_left_state_seq { \int_use:N \l_@@_left_state_int } \seq_push:No \l_@@_right_state_seq { \int_use:N \l_@@_right_state_int } } \cs_new_protected_nopar:Npn \@@_pop_lr_states: { \seq_pop:NN \l_@@_left_state_seq \l_@@_internal_a_tl \int_set:Nn \l_@@_left_state_int \l_@@_internal_a_tl \seq_pop:NN \l_@@_right_state_seq \l_@@_internal_a_tl \int_set:Nn \l_@@_right_state_int \l_@@_internal_a_tl } % \end{macrocode} % \end{macro} % % \begin{macro}[int]{\@@_toks_put_left:Nx} % \begin{macro}[int]{\@@_toks_put_right:Nx, \@@_toks_put_right:Nn} % During the building phase we wish to add \texttt{x}-expanded % material to \tn{toks}, either to the left or to the right. The % expansion is done \enquote{by hand} for optimization (these % operations are used quite a lot). The \texttt{Nn} version of % \cs{@@_toks_put_right:Nx} is provided because it is more % efficient than \texttt{x}-expanding with \cs{exp_not:n}. % \begin{macrocode} \cs_new_protected:Npn \@@_toks_put_left:Nx #1#2 { \cs_set_nopar:Npx \@@_tmp:w { #2 } \tex_toks:D #1 \exp_after:wN \exp_after:wN \exp_after:wN { \exp_after:wN \@@_tmp:w \tex_the:D \tex_toks:D #1 } } \cs_new_protected:Npn \@@_toks_put_right:Nx #1#2 { \cs_set_nopar:Npx \@@_tmp:w {#2} \tex_toks:D #1 \exp_after:wN { \tex_the:D \tex_toks:D \exp_after:wN #1 \@@_tmp:w } } \cs_new_protected:Npn \@@_toks_put_right:Nn #1#2 { \tex_toks:D #1 \exp_after:wN { \tex_the:D \tex_toks:D #1 #2 } } % \end{macrocode} % \end{macro} % \end{macro} % % \begin{macro}[int] % { % \@@_build_transition_left:NNN, % \@@_build_transition_right:nNn % } % Add a transition from |#2| to |#3| using the function |#1|. The % \texttt{left} function is used for higher priority transitions, and % the \texttt{right} function for lower priority transitions (which % should be performed later). The signatures differ to reflect the % differing usage later on. Both functions could be optimized. % \begin{macrocode} \cs_new_protected:Npn \@@_build_transition_left:NNN #1#2#3 { \@@_toks_put_left:Nx #2 { #1 { \int_eval:n { #3 - #2 } } } } \cs_new_protected:Npn \@@_build_transition_right:nNn #1#2#3 { \@@_toks_put_right:Nx #2 { #1 { \int_eval:n { #3 - #2 } } } } % \end{macrocode} % \end{macro} % % \begin{macro}[int]{\@@_build_new_state:} % Add a new empty state to the \textsc{nfa}. Then update the % \texttt{left}, \texttt{right}, and \texttt{max} states, so that the % \texttt{right} state is the new empty state, and the \texttt{left} % state points to the previously \enquote{current} state. % \begin{macrocode} \cs_new_protected_nopar:Npn \@@_build_new_state: { %<*trace> \trace:nnx { regex } { 2 } { regex~new~state~ L=\int_use:N \l_@@_left_state_int ~ -> ~ R=\int_use:N \l_@@_right_state_int ~ -> ~ M=\int_use:N \l_@@_max_state_int ~ -> ~ \int_eval:n { \l_@@_max_state_int + \c_one } } % \tex_toks:D \l_@@_max_state_int { } \int_set_eq:NN \l_@@_left_state_int \l_@@_right_state_int \int_set_eq:NN \l_@@_right_state_int \l_@@_max_state_int \int_incr:N \l_@@_max_state_int } % \end{macrocode} % \end{macro} % % \begin{macro}[int]{\@@_build_transitions_lazyness:NNNNN} % This function creates a new state, and puts two transitions starting % from the old current state. The order of the transitions is % controlled by |#1|, true for lazy quantifiers, and false for greedy % quantifiers. % \begin{macrocode} \cs_new_protected:Npn \@@_build_transitions_lazyness:NNNNN #1#2#3#4#5 { \@@_build_new_state: \@@_toks_put_right:Nx \l_@@_left_state_int { \if_meaning:w \c_true_bool #1 #2 { \int_eval:n { #3 - \l_@@_left_state_int } } #4 { \int_eval:n { #5 - \l_@@_left_state_int } } \else: #4 { \int_eval:n { #5 - \l_@@_left_state_int } } #2 { \int_eval:n { #3 - \l_@@_left_state_int } } \fi: } } % \end{macrocode} % \end{macro} % % \subsubsection{Building classes} % % \begin{macro}[int]{\@@_class:NnnnN} % \begin{macro}[int, rEXP]{\@@_tests_action_cost:n} % The arguments are: \meta{boolean} \Arg{tests} \Arg{min} \Arg{more} % \meta{lazyness}. First store the tests with a trailing % \cs{@@_action_cost:n}, in the true branch of % \cs{@@_break_point:TF} for positive classes, or the false branch % for negative classes. The integer \meta{more} is $0$ for fixed % repetitions, $-1$ for unbounded repetitions, and % $\meta{max}-\meta{min}$ for a range of repetitions. % \begin{macrocode} \cs_new_protected:Npn \@@_class:NnnnN #1#2#3#4#5 { \cs_set_nopar:Npx \@@_tests_action_cost:n ##1 { \exp_not:n { \exp_not:n {#2} } \bool_if:NTF #1 { \@@_break_point:TF { \@@_action_cost:n {##1} } { } } { \@@_break_point:TF { } { \@@_action_cost:n {##1} } } } \if_case:w - #4 \exp_stop_f: \@@_class_repeat:n {#3} \or: \@@_class_repeat:nN {#3} #5 \else: \@@_class_repeat:nnN {#3} {#4} #5 \fi: } \cs_new:Npn \@@_tests_action_cost:n { \@@_action_cost:n } % \end{macrocode} % \end{macro} % \end{macro} % % \begin{macro}[aux]{\@@_class_repeat:n} % This is used for a fixed number of repetitions. Build one state for % each repetition, with a transition controlled by the tests that we % have collected. That works just fine for |#1|${}=0$ repetitions: % nothing is built. % \begin{macrocode} \cs_new_protected:Npn \@@_class_repeat:n #1 { \prg_replicate:nn {#1} { \@@_build_new_state: \@@_build_transition_right:nNn \@@_tests_action_cost:n \l_@@_left_state_int \l_@@_right_state_int } } % \end{macrocode} % \end{macro} % % \begin{macro}[aux]{\@@_class_repeat:nN} % This implements unbounded repetitions of a single class (\emph{e.g.} % the |*| and |+| quantifiers). If the minimum number |#1| of % repetitions is $0$, then build a transition from the current state % to itself governed by the tests, and a free transition to a new % state (hence skipping the tests). Otherwise, call % \cs{@@_class_repeat:n} for the code to match |#1| repetitions, % and add free transitions from the last state to the previous one, % and to a new one. In both cases, the order of transitions is % controlled by the lazyness boolean |#2|. % \begin{macrocode} \cs_new_protected:Npn \@@_class_repeat:nN #1#2 { \if_int_compare:w #1 = \c_zero \@@_build_transitions_lazyness:NNNNN #2 \@@_action_free:n \l_@@_right_state_int \@@_tests_action_cost:n \l_@@_left_state_int \else: \@@_class_repeat:n {#1} \int_set_eq:NN \l_@@_internal_a_int \l_@@_left_state_int \@@_build_transitions_lazyness:NNNNN #2 \@@_action_free:n \l_@@_right_state_int \@@_action_free:n \l_@@_internal_a_int \fi: } % \end{macrocode} % \end{macro} % % \begin{macro}[aux]{\@@_class_repeat:nnN} % We want to build the code to match from |#1| to $|#1|+|#2|$ % repetitions. Match |#1| repetitions (can be $0$). Compute the final % state of the next construction as \texttt{a}. Build $|#2|>0$ states, % each with a transition to the next state governed by the tests, and % a transition to the final state \texttt{a}. The computation of % \texttt{a} is safe because states are allocated in order, starting % from \texttt{max_state}. % \begin{macrocode} \cs_new_protected:Npn \@@_class_repeat:nnN #1#2#3 { \@@_class_repeat:n {#1} \int_set:Nn \l_@@_internal_a_int { \l_@@_max_state_int + #2 - \c_one } \prg_replicate:nn { #2 } { \@@_build_transitions_lazyness:NNNNN #3 \@@_action_free:n \l_@@_internal_a_int \@@_tests_action_cost:n \l_@@_right_state_int } } % \end{macrocode} % \end{macro} % % \subsubsection{Building groups} % % \begin{macro}[aux]{\@@_group_aux:nnnnN} % Arguments: \Arg{label} \Arg{contents} \Arg{min} \Arg{more} % \meta{lazyness}. If \meta{min} is $0$, we need to add a state before % building the group, so that the thread which skips the group does % not also set the start-point of the submatch. After adding one more % state, the \texttt{left_state} is the left end of the group, from % which all branches will stem, and the \texttt{right_state} is the % right end of the group, and all branches end their course in that % state. We store those two integers to be queried for each branch, we % build the \textsc{nfa} states for the contents |#2| of the group, % and we forget about the two integers. Once this is done, perform the % repetition: either exactly |#3| times, or |#3| or more times, or % between |#3| and $|#3|+|#4|$ times, with lazyness |#5|. The % \meta{label} |#1| is used for submatch tracking. Each of the three % auxiliaries expects \texttt{left_state} and \texttt{right_state} to % be set properly. % \begin{macrocode} \cs_new_protected:Npn \@@_group_aux:nnnnN #1#2#3#4#5 { % \trace_push:nnn { regex } { 1 } { @@_group } \if_int_compare:w #3 = \c_zero \@@_build_new_state: %\assert_int:n { \l_@@_max_state_int = \l_@@_right_state_int + 1 } \@@_build_transition_right:nNn \@@_action_free_group:n \l_@@_left_state_int \l_@@_right_state_int \fi: \@@_build_new_state: \@@_push_lr_states: #2 \@@_pop_lr_states: \if_case:w - #4 \exp_stop_f: \@@_group_repeat:nn {#1} {#3} \or: \@@_group_repeat:nnN {#1} {#3} #5 \else: \@@_group_repeat:nnnN {#1} {#3} {#4} #5 \fi: % \trace_pop:nnn { regex } { 1 } { @@_group } } % \end{macrocode} % \end{macro} % % \begin{macro}[int]{\@@_group:nnnN, \@@_group_no_capture:nnnN} % Hand to \cs{@@_group_aux:nnnnnN} the label of that group % (expanded), and the group itself, with some extra commands to % perform. % \begin{macrocode} \cs_new_protected:Npn \@@_group:nnnN #1 { \exp_args:No \@@_group_aux:nnnnN { \int_use:N \l_@@_capturing_group_int } { \int_incr:N \l_@@_capturing_group_int #1 } } \cs_new_protected_nopar:Npn \@@_group_no_capture:nnnN { \@@_group_aux:nnnnN { -1 } } % \end{macrocode} % \end{macro} % % \begin{macro}[int]{\@@_group_resetting:nnnN} % \begin{macro}[aux]{\@@_group_resetting_loop:nnNn} % Again, hand the label $-1$ to \cs{@@_group_aux:nnnnN}, but this % time we work a little bit harder to keep track of the maximum group % label at the end of any branch, and to reset the group number at % each branch. This relies on the fact that a compiled regex always is % a sequence of items of the form \cs{@@_branch:n} \Arg{branch}. % \begin{macrocode} \cs_new_protected:Npn \@@_group_resetting:nnnN #1 { \@@_group_aux:nnnnN { -1 } { \exp_args:Noo \@@_group_resetting_loop:nnNn { \int_use:N \l_@@_capturing_group_int } { \int_use:N \l_@@_capturing_group_int } #1 { ?? \__prg_break:n } { } \__prg_break_point: } } \cs_new_protected:Npn \@@_group_resetting_loop:nnNn #1#2#3#4 { \use_none:nn #3 { \int_set:Nn \l_@@_capturing_group_int {#1} } \int_set:Nn \l_@@_capturing_group_int {#2} #3 {#4} \exp_args:Nf \@@_group_resetting_loop:nnNn { \int_max:nn {#1} { \l_@@_capturing_group_int } } {#2} } % \end{macrocode} % \end{macro} % \end{macro} % % \begin{macro}[int]{\@@_branch:n} % Add a free transition from the left state of the current group to a % brand new state, starting point of this branch. Once the branch is % built, add a transition from its last state to the right state of % the group. The left and right states of the group are extracted from % the relevant sequences. % \begin{macrocode} \cs_new_protected:Npn \@@_branch:n #1 { % \trace_push:nnn { regex } { 1 } { @@_branch } \@@_build_new_state: \seq_get:NN \l_@@_left_state_seq \l_@@_internal_a_tl \int_set:Nn \l_@@_left_state_int \l_@@_internal_a_tl \@@_build_transition_right:nNn \@@_action_free:n \l_@@_left_state_int \l_@@_right_state_int #1 \seq_get:NN \l_@@_right_state_seq \l_@@_internal_a_tl \@@_build_transition_right:nNn \@@_action_free:n \l_@@_right_state_int \l_@@_internal_a_tl % \trace_pop:nnn { regex } { 1 } { @@_branch } } % \end{macrocode} % \end{macro} % % \begin{macro}[int]{\@@_group_repeat:nn} % This function is called to repeat a group a fixed number of times % |#2|; if this is $0$ we remove the group altogether (but don't reset % the \texttt{capturing_group} label). Otherwise, the auxiliary % \cs{@@_group_repeat_aux:n} copies |#2| times the \tn{toks} for % the group, and leaves \texttt{internal_a} pointing to the left end % of the last repetition. We only record the submatch information at % the last repetition. Finally, add a state at the end (the transition % to it has been taken care of by the replicating auxiliary. % \begin{macrocode} \cs_new_protected:Npn \@@_group_repeat:nn #1#2 { \if_int_compare:w #2 = \c_zero \int_set:Nn \l_@@_max_state_int { \l_@@_left_state_int - \c_one } \@@_build_new_state: \else: \@@_group_repeat_aux:n {#2} \@@_group_submatches:nNN {#1} \l_@@_internal_a_int \l_@@_right_state_int \@@_build_new_state: \fi: } % \end{macrocode} % \end{macro} % % \begin{macro}[aux]{\@@_group_submatches:nNN} % This inserts in states |#2| and |#3| the code for tracking % submatches of the group |#1|, unless inhibited by a label of $-1$. % \begin{macrocode} \cs_new_protected:Npn \@@_group_submatches:nNN #1#2#3 { \if_int_compare:w #1 > \c_minus_one \@@_toks_put_left:Nx #2 { \@@_action_submatch:n { #1 < } } \@@_toks_put_left:Nx #3 { \@@_action_submatch:n { #1 > } } \fi: } % \end{macrocode} % \end{macro} % % \begin{macro}[aux]{\@@_group_repeat_aux:n} % Here we repeat \tn{toks} ranging from \texttt{left_state} to % \texttt{max_state}, $|#1|>0$ times. First add a transition so that % the copies will \enquote{chain} properly. Compute the shift % \texttt{c} between the original copy and the last copy we % want. Shift the \texttt{right_state} and \texttt{max_state} to their % final values. We then want to perform \texttt{c} copy operations. At % the end, \texttt{b} is equal to the \texttt{max_state}, and % \texttt{a} points to the left of the last copy of the group. % \begin{macrocode} \cs_new_protected:Npn \@@_group_repeat_aux:n #1 { \@@_build_transition_right:nNn \@@_action_free:n \l_@@_right_state_int \l_@@_max_state_int \int_set_eq:NN \l_@@_internal_a_int \l_@@_left_state_int \int_set_eq:NN \l_@@_internal_b_int \l_@@_max_state_int \if_int_compare:w \__int_eval:w #1 > \c_one \int_set:Nn \l_@@_internal_c_int { ( #1 - \c_one ) * ( \l_@@_internal_b_int - \l_@@_internal_a_int ) } \tex_advance:D \l_@@_right_state_int \l_@@_internal_c_int \tex_advance:D \l_@@_max_state_int \l_@@_internal_c_int \prg_replicate:nn \l_@@_internal_c_int { \tex_toks:D \l_@@_internal_b_int = \tex_toks:D \l_@@_internal_a_int \tex_advance:D \l_@@_internal_a_int \c_one \tex_advance:D \l_@@_internal_b_int \c_one } \fi: } % \end{macrocode} % \end{macro} % % \begin{macro}[aux]{\@@_group_repeat:nnN} % This function is called to repeat a group at least $n$ times; the % case $n=0$ is very different from $n>0$. Assume first that $n=0$. % Insert submatch tracking information at the start and end of the % group, add a free transition from the right end to the % \enquote{true} left state \texttt{a} (remember: in this case we had % added an extra state before the left state). This forms the loop, % which we break away from by adding a free transition from \texttt{a} % to a new state. % % Now consider the case $n>0$. Repeat the group $n$ times, chaining % various copies with a free transition. Add submatch tracking only to % the last copy, then add a free transition from the right end back to % the left end of the last copy, either before or after the transition % to move on towards the rest of the \textsc{nfa}. This transition can % end up before submatch tracking, but that is irrelevant since it % only does so when going again through the group, recording new % matches. Finally, add a state; we already have a transition pointing % to it from \cs{@@_group_repeat_aux:n}. % \begin{macrocode} \cs_new_protected:Npn \@@_group_repeat:nnN #1#2#3 { \if_int_compare:w #2 = \c_zero \@@_group_submatches:nNN {#1} \l_@@_left_state_int \l_@@_right_state_int \int_set:Nn \l_@@_internal_a_int { \l_@@_left_state_int - \c_one } \@@_build_transition_right:nNn \@@_action_free:n \l_@@_right_state_int \l_@@_internal_a_int \@@_build_new_state: \if_meaning:w \c_true_bool #3 \@@_build_transition_left:NNN \@@_action_free:n \l_@@_internal_a_int \l_@@_right_state_int \else: \@@_build_transition_right:nNn \@@_action_free:n \l_@@_internal_a_int \l_@@_right_state_int \fi: \else: \@@_group_repeat_aux:n {#2} \@@_group_submatches:nNN {#1} \l_@@_internal_a_int \l_@@_right_state_int \if_meaning:w \c_true_bool #3 \@@_build_transition_right:nNn \@@_action_free_group:n \l_@@_right_state_int \l_@@_internal_a_int \else: \@@_build_transition_left:NNN \@@_action_free_group:n \l_@@_right_state_int \l_@@_internal_a_int \fi: \@@_build_new_state: \fi: } % \end{macrocode} % \end{macro} % % \begin{macro}[aux]{\@@_group_repeat:nnnN} % We wish to repeat the group between |#2| and $|#2|+|#3|$ times, with % a lazyness controlled by |#4|. We insert submatch tracking up front: % in principle, we could avoid recording submatches for the first |#2| % copies of the group, but that forces us to treat specially the case % $|#2|=0$. Repeat that group with submatch tracking $|#2|+|#3|$ times % (the maximum number of repetitions). Then our goal is to add |#3| % transitions from the end of the |#2|-th group, and each subsequent % groups, to the end. For a lazy quantifier, we add those transitions % to the left states, before submatch tracking. For the greedy case, % we add the transitions to the right states, after submatch tracking % and the transitions which go on with more repetitions. In the greedy % case with $|#2|=0$, the transition which skips over all copies of % the group must be added separately, because its starting state does % not follow the normal pattern: we had to add it \enquote{by hand} % earlier. % \begin{macrocode} \cs_new_protected:Npn \@@_group_repeat:nnnN #1#2#3#4 { \@@_group_submatches:nNN {#1} \l_@@_left_state_int \l_@@_right_state_int \@@_group_repeat_aux:n { #2 + #3 } \if_meaning:w \c_true_bool #4 \int_set_eq:NN \l_@@_left_state_int \l_@@_max_state_int \prg_replicate:nn { #3 } { \int_sub:Nn \l_@@_left_state_int { \l_@@_internal_b_int - \l_@@_internal_a_int } \@@_build_transition_left:NNN \@@_action_free:n \l_@@_left_state_int \l_@@_max_state_int } \else: \prg_replicate:nn { #3 - \c_one } { \int_sub:Nn \l_@@_right_state_int { \l_@@_internal_b_int - \l_@@_internal_a_int } \@@_build_transition_right:nNn \@@_action_free:n \l_@@_right_state_int \l_@@_max_state_int } \if_int_compare:w #2 = \c_zero \int_set:Nn \l_@@_right_state_int { \l_@@_left_state_int - \c_one } \else: \int_sub:Nn \l_@@_right_state_int { \l_@@_internal_b_int - \l_@@_internal_a_int } \fi: \@@_build_transition_right:nNn \@@_action_free:n \l_@@_right_state_int \l_@@_max_state_int \fi: \@@_build_new_state: } % \end{macrocode} % \end{macro} % % \subsubsection{Others} % % \begin{macro}[int]{\@@_assertion:Nn, \@@_b_test:, \@@_anchor:N} % Usage: \cs{@@_assertion:Nn} \meta{boolean} \Arg{test}, where the % \meta{test} is either of the two other functions. Add a free % transition to a new state, conditionally to the assertion test. The % \cs{@@_b_test:} test is used by the |\b| and |\B| escape: check % if the last character was a word character or not, and do the same % to the current character. The boundary-markers of the string are % non-word characters for this purpose. Anchors at the start or end % of match use \cs{@@_anchor:N}, with a position controlled by the % integer |#1|. % \begin{macrocode} \cs_new_protected:Npn \@@_assertion:Nn #1#2 { \@@_build_new_state: \@@_toks_put_right:Nx \l_@@_left_state_int { \exp_not:n {#2} \@@_break_point:TF \bool_if:NF #1 { { } } { \@@_action_free:n { \int_eval:n { \l_@@_right_state_int - \l_@@_left_state_int } } } \bool_if:NT #1 { { } } } } \cs_new_protected:Npn \@@_anchor:N #1 { \if_int_compare:w #1 = \l_@@_current_pos_int \exp_after:wN \@@_break_true:w \fi: } \cs_new_protected_nopar:Npn \@@_b_test: { \group_begin: \int_set_eq:NN \l_@@_current_char_int \l_@@_last_char_int \@@_prop_w: \@@_break_point:TF { \group_end: \@@_item_reverse:n \@@_prop_w: } { \group_end: \@@_prop_w: } } % \end{macrocode} % \end{macro} % % \begin{macro}[int]{\@@_command_K:} % Change the starting point of the $0$-th submatch (full match), and % transition to a new state, pretending that this is a fresh thread. % \begin{macrocode} \cs_new_protected_nopar:Npn \@@_command_K: { \@@_build_new_state: \@@_toks_put_right:Nx \l_@@_left_state_int { \@@_action_submatch:n { 0< } \bool_set_true:N \l_@@_fresh_thread_bool \@@_action_free:n { \int_eval:n { \l_@@_right_state_int - \l_@@_left_state_int } } \bool_set_false:N \l_@@_fresh_thread_bool } } % \end{macrocode} % \end{macro} % % \subsection{Matching} % % We search for matches by running all the execution threads through the % \textsc{nfa} in parallel, reading one token of the query at each step. % The \textsc{nfa} contains \enquote{free} transitions to other states, % and transitions which \enquote{consume} the current token. For free % transitions, the instruction at the new state of the \textsc{nfa} is % performed immediately. When a transition consumes a character, the % new state is appended to a list of \enquote{active states}, stored in % \tn{skip} registers: this thread will be active again when the next % token is read from the query. At every step (for each token in the % query), we unpack that list of active states and the corresponding % submatch props, and empty those. % % If two paths through the \textsc{nfa} \enquote{collide} in the sense % that they reach the same state after reading a given token, then they % only differ in how they previously matched, and the future execution % will be identical for both. (Note that this would be wrong in the % presence of back-references.) Hence, we only need to keep one of the % two threads: the thread with the highest priority. Our \textsc{nfa} is % built in such a way that higher priority actions always come before % lower priority actions, which makes things work. % % The explanation in the previous paragraph may make us think that we % simply need to keep track of which states were visited at a given % step: after all, the loop generated when matching |(a?)*| against |a| % is broken, isn't it? No. The group first matches |a|, as it should, % then repeats; it attempts to match |a| again but fails; it skips |a|, % and finds out that this state has already been seen at this position % in the query: the match stops. The capturing group is (wrongly) |a|. % What went wrong is that a thread collided with itself, and the later % version, which has gone through the group one more times with an empty % match, should have a higher priority than not going through the group. % % We solve this by distinguishing \enquote{normal} free transitions % \cs{@@_action_free:n} from transitions % \cs{@@_action_free_group:n} which go back to the start of the % group. The former will keep threads unless they have been visited by a % \enquote{completed} thread, while the latter kind of transition also % prevents going back to a state visited by the current thread. % % \subsubsection{Variables used when matching} % % \begin{variable} % { % \l_@@_min_pos_int, % \l_@@_max_pos_int, % \l_@@_current_pos_int, % \l_@@_start_pos_int, % \l_@@_success_pos_int, % } % The tokens in the query are indexed from \texttt{min_pos} for the % first to $\texttt{max_pos}-1$ for the last, and their information is % stored in \tn{muskip} and \tn{toks} registers with those numbers. We % don't start from $0$ because the \tn{toks} registers with low % numbers are used to hold the states of the \textsc{nfa}. We match % without backtracking, keeping all threads in lockstep at the % \texttt{current_pos} in the query. The starting point of the current % match attempt is \texttt{start_pos}, and \texttt{success_pos}, % updated whenever a thread succeeds, is used as the next starting % position. % \begin{macrocode} \int_new:N \l_@@_min_pos_int \int_new:N \l_@@_max_pos_int \int_new:N \l_@@_current_pos_int \int_new:N \l_@@_start_pos_int \int_new:N \l_@@_success_pos_int % \end{macrocode} % \end{variable} % % \begin{variable} % { % \l_@@_current_char_int, % \l_@@_current_catcode_int, % \l_@@_last_char_int, % \l_@@_case_changed_char_int % } % The character and category codes of the token at the current % position; the character code of the token at the previous position; % and the character code of the result of changing the case of the % current token (|A-Z|$\leftrightarrow$|a-z|). This last integer is % only computed when necessary, and is otherwise \cs{c_max_int}. The % \texttt{current_char} variable is also used in various other phases % to hold a character code. % \begin{macrocode} \int_new:N \l_@@_current_char_int \int_new:N \l_@@_current_catcode_int \int_new:N \l_@@_last_char_int \int_new:N \l_@@_case_changed_char_int % \end{macrocode} % \end{variable} % % \begin{variable}{\l_@@_current_state_int} % For every character in the token list, each of the active states is % considered in turn. The variable \cs{l_@@_current_state_int} % holds the state of the \textsc{nfa} which is currently considered: % transitions are then given as shifts relative to the current state. % \begin{macrocode} \int_new:N \l_@@_current_state_int % \end{macrocode} % \end{variable} % % \begin{variable} % {\l_@@_current_submatches_prop, \l_@@_success_submatches_prop} % The submatches for the thread which is currently active are stored % in the \texttt{current_submatches} property list variable. This % property list is stored by \cs{@@_action_cost:n} into the % \tn{toks} register for the target state of the transition, to be % retrieved when matching at the next position. When a thread % succeeds, this property list is copied to % \cs{l_@@_success_submatches_prop}: only the last successful thread % will remain there. % \begin{macrocode} \prop_new:N \l_@@_current_submatches_prop \prop_new:N \l_@@_success_submatches_prop % \end{macrocode} % \end{variable} % % \begin{variable}{\l_@@_step_int} % This integer, always even, is increased every time a character in % the query is read, and not reset when doing multiple matches. For % each \meta{state} in the \textsc{nfa} we store in % \tn{dimen}\meta{state} the last step in which this state was % encountered. This lets us break infinite loops by not visiting the % same state twice in the same step. In fact, \tn{dimen}\meta{state} % is equal \texttt{step} when we have started performing the % operations of \tn{toks}\meta{state}, but not finished yet. However, % once we finish, we set \tn{dimen}\meta{state} to % $\text{\texttt{step}}+1$. This is needed to track submatches % properly (see building phase). The \texttt{step} is also used to % attach each set of submatch information to a given iteration (and % automatically discard it when it corresponds to a past step). % \begin{macrocode} \int_new:N \l_@@_step_int % \end{macrocode} % \end{variable} % % \begin{variable}{\l_@@_min_active_int, \l_@@_max_active_int} % All the currently active states are kept in order of precedence in % the \tn{skip} registers, and the corresponding submatches in the % \tn{toks}. For our purposes, those serve as an array, indexed from % \texttt{min_active} (inclusive) to \texttt{max_active} (excluded). % At the start of every step, the whole array is unpacked, so that the % space can immediately be reused, and \texttt{max_active} is reset to % \texttt{min_active}, effectively clearing the array. % \begin{macrocode} \int_new:N \l_@@_min_active_int \int_new:N \l_@@_max_active_int % \end{macrocode} % \end{variable} % % \begin{variable}{\l_@@_every_match_tl} % Every time a match is found, this token list is used. For single % matching, the token list is empty. For multiple matching, the token % list is set to repeat the matching, after performing some operation % which depends on the user function. See \cs{@@_single_match:} and % \cs{@@_multi_match:n}. % \begin{macrocode} \tl_new:N \l_@@_every_match_tl % \end{macrocode} % \end{variable} % % \begin{variable}{\l_@@_fresh_thread_bool, \l_@@_empty_success_bool} % \begin{macro}[aux]{\@@_if_two_empty_matches:F} % When doing multiple matches, we need to avoid infinite loops where % each iteration matches the same empty token list. When an empty % token list is matched, the next successful match of the same empty % token list is suppressed. We detect empty matches by setting % \cs{l_@@_fresh_thread_bool} to \texttt{true} for threads which % directly come from the start of the regex or from the |\K| command, % and testing that boolean whenever a thread succeeds. The function % \cs{@@_if_two_empty_matches:F} is redefined at every match % attempt, depending on whether the previous match was empty or not: % if it was, then the function must cancel a purported success if it % is empty and at the same spot as the previous match; otherwise, we % definitely don't have two identical empty matches, so the function % is \cs{use:n}. % \begin{macrocode} \bool_new:N \l_@@_fresh_thread_bool \bool_new:N \l_@@_empty_success_bool \cs_new_eq:NN \@@_if_two_empty_matches:F \use:n % \end{macrocode} % \end{macro} % \end{variable} % % \begin{variable} % { % \g_@@_success_bool, % \l_@@_saved_success_bool, % \l_@@_match_success_bool % } % The boolean \cs{l_@@_match_success_bool} is true if the current % match attempt was successful, and \cs{g_@@_success_bool} is true % if there was at least one successful match. This is the only global % variable in this whole module, but we would need it to be local when % matching a control sequence with |\c{...}|. This is done by saving % the global variable into \cs{l_@@_saved_success_bool}, which is % local, hence not affected by the changes due to inner regex % functions. % \begin{macrocode} \bool_new:N \g_@@_success_bool \bool_new:N \l_@@_saved_success_bool \bool_new:N \l_@@_match_success_bool % \end{macrocode} % \end{variable} % % \subsubsection{Matching: framework} % % \begin{macro}[int]{\@@_match:n} % First store the query into \tn{toks} and \tn{muskip} registers (see % \cs{@@_query_set:nnn}). Then initialize the variables that should % be set once for each user function (even for multiple % matches). Namely, the overall matching is not yet successful; none of % the states should be marked as visited (\tn{dimen} registers), and % we start at step $0$; we pretend that there was a previous match % ending at the start of the query, which was not empty (to avoid % smothering an empty match at the start). Once all this is set up, we % are ready for the ride. Find the first match. % \begin{macrocode} \cs_new_protected:Npn \@@_match:n #1 { % \trace_push:nnx { regex } { 1 } { @@_match } % \trace:nnx { regex } { 1 } { analyzing~query~token~list } \int_zero:N \l_@@_balance_int \int_set:Nn \l_@@_current_pos_int { \c_two * \l_@@_max_state_int } \@@_query_set:nnn { } { -1 } { -2 } \int_set_eq:NN \l_@@_min_pos_int \l_@@_current_pos_int \__tl_analysis_map_inline:nn {#1} { \@@_query_set:nnn {##1} {"##2} {##3} } \int_set_eq:NN \l_@@_max_pos_int \l_@@_current_pos_int \@@_query_set:nnn { } { -1 } { -2 } % \trace:nnx { regex } { 1 } { initializing } \bool_gset_false:N \g_@@_success_bool \int_step_inline:nnnn \l_@@_min_state_int \c_one { \l_@@_max_state_int - \c_one } { \tex_dimen:D ##1 \c_one sp \scan_stop: } \int_set_eq:NN \l_@@_min_active_int \l_@@_max_state_int \int_set_eq:NN \l_@@_step_int \c_zero \int_set_eq:NN \l_@@_success_pos_int \l_@@_min_pos_int \int_set:Nn \l_@@_submatch_int { \c_two * \l_@@_max_state_int } \bool_set_false:N \l_@@_empty_success_bool \@@_match_once: % \trace_pop:nnx { regex } { 1 } { @@_match } } % \end{macrocode} % \end{macro} % % \begin{macro}[int]{\@@_match_once:} % This function finds one match, then does some action defined by the % \texttt{every_match} token list, which may recursively call % \cs{@@_match_once:}. First initialize some variables: set the % conditional which detects identical empty matches; this match % attempt starts at the previous \texttt{success_pos}, is not yet % successful, and has no submatches yet; clear the array of active % threads, and put the starting state $0$ in it. We are then almost % ready to read our first token in the query, but we actually start % one position earlier than the start, and \texttt{get} that token, so % that the \texttt{last_char} will be set properly for word % boundaries. Then call \cs{@@_match_loop:}, which runs through the % query until the end or until a successful match breaks early. % \begin{macrocode} \cs_new_protected_nopar:Npn \@@_match_once: { \if_meaning:w \c_true_bool \l_@@_empty_success_bool \cs_set_nopar:Npn \@@_if_two_empty_matches:F { \int_compare:nNnF \l_@@_start_pos_int = \l_@@_current_pos_int } \else: \cs_set_eq:NN \@@_if_two_empty_matches:F \use:n \fi: \int_set_eq:NN \l_@@_start_pos_int \l_@@_success_pos_int \bool_set_false:N \l_@@_match_success_bool \prop_clear:N \l_@@_current_submatches_prop \int_set_eq:NN \l_@@_max_active_int \l_@@_min_active_int \@@_store_state:n { \l_@@_min_state_int } \int_set:Nn \l_@@_current_pos_int { \l_@@_start_pos_int - \c_one } \@@_query_get: \@@_match_loop: \l_@@_every_match_tl } % \end{macrocode} % \end{macro} % % \begin{macro}[int]{\@@_single_match:, \@@_multi_match:n} % For a single match, the overall success is determined by whether the % only match attempt is a success. When doing multiple matches, the % overall matching is successful as soon as any match % succeeds. Perform the action |#1|, then find the next match. % \begin{macrocode} \cs_new_protected_nopar:Npn \@@_single_match: { \tl_set:Nn \l_@@_every_match_tl { \bool_gset_eq:NN \g_@@_success_bool \l_@@_match_success_bool } } \cs_new_protected:Npn \@@_multi_match:n #1 { \tl_set:Nn \l_@@_every_match_tl { \if_meaning:w \c_true_bool \l_@@_match_success_bool \bool_gset_true:N \g_@@_success_bool #1 \exp_after:wN \@@_match_once: \fi: } } % \end{macrocode} % \end{macro} % % \begin{macro}[aux]{\@@_match_loop:} % \begin{macro}[aux, rEXP]{\@@_match_one_active:w} % At each new position, set some variables and get the new character % and category from the query. Then unpack the array of active % threads, and clear it by resetting its length % (\texttt{max_active}). This results in a sequence of % \cs{@@_use_state_and_submatches:nn} \Arg{state} \Arg{prop}, and % we consider those states one by one in order. As soon as a thread % succeeds, exit the step, and, if there are threads to consider at the % next position, and we have not reached the end of the string, % repeat the loop. Otherwise, the last thread that succeeded is what % \cs{@@_match_once:} matches. We explain the \texttt{fresh_thread} % business when describing \cs{@@_action_wildcard:}. % \begin{macrocode} \cs_new_protected_nopar:Npn \@@_match_loop: { \tex_advance:D \l_@@_step_int \c_two \int_incr:N \l_@@_current_pos_int \int_set_eq:NN \l_@@_last_char_int \l_@@_current_char_int \int_set_eq:NN \l_@@_case_changed_char_int \c_max_int \@@_query_get: \use:x { \int_set_eq:NN \l_@@_max_active_int \l_@@_min_active_int \exp_after:wN \@@_match_one_active:w \int_use:N \l_@@_min_active_int ; } \__prg_break_point: \bool_set_false:N \l_@@_fresh_thread_bool %^^A was arg of break_point:n \if_int_compare:w \l_@@_max_active_int > \l_@@_min_active_int \if_int_compare:w \l_@@_current_pos_int < \l_@@_max_pos_int \exp_after:wN \exp_after:wN \exp_after:wN \@@_match_loop: \fi: \fi: } \cs_new:Npn \@@_match_one_active:w #1; { \if_int_compare:w #1 < \l_@@_max_active_int \@@_use_state_and_submatches:nn { \__int_value:w \tex_skip:D #1 } { \tex_the:D \tex_toks:D #1 } \exp_after:wN \@@_match_one_active:w \int_use:N \__int_eval:w #1 + \c_one \exp_after:wN ; \fi: } % \end{macrocode} % \end{macro} % \end{macro} % % \begin{macro}[aux]{\@@_query_set:nnn} % The arguments are: tokens that \texttt{o} and \texttt{x} expand to % one token of the query, the catcode, and the character code. Store % those, and the current brace balance (used later to check for % overall brace balance) in a \tn{muskip} register and a \tn{toks}, % then update the \texttt{balance}. % \begin{macrocode} \cs_new_protected:Npn \@@_query_set:nnn #1#2#3 { \tex_muskip:D \l_@@_current_pos_int = \etex_gluetomu:D #3 sp plus #2 sp minus \l_@@_balance_int sp \scan_stop: \tex_toks:D \l_@@_current_pos_int {#1} \int_incr:N \l_@@_current_pos_int \if_case:w #2 \exp_stop_f: \or: \int_incr:N \l_@@_balance_int \or: \int_decr:N \l_@@_balance_int \fi: } % \end{macrocode} % \end{macro} % % \begin{macro}[aux]{\@@_query_get:} % Extract the current character and category codes from the % \tn{muskip} register of the current position: those are the main and % the stretch components, and we need a conversion to avoid \TeX{}'s % \enquote{incompatible glue units} error. % \begin{macrocode} \cs_new_protected_nopar:Npn \@@_query_get: { \l_@@_current_char_int = \etex_mutoglue:D \tex_muskip:D \l_@@_current_pos_int \l_@@_current_catcode_int = \etex_gluestretch:D \etex_mutoglue:D \tex_muskip:D \l_@@_current_pos_int } % \end{macrocode} % \end{macro} % % \subsubsection{Using states of the \textsc{nfa}} % % \begin{macro}[int]{\@@_use_state:} % Use the current \textsc{nfa} instruction. The state is initially % marked as belonging to the current \texttt{step}: this allows normal % free transition to repeat, but group-repeating transitions % won't. Once we are done exploring all the branches it spawned, the % state is marked as $\texttt{step}+1$: any thread hitting it at that % point will be terminated. % \begin{macrocode} \cs_new_protected_nopar:Npn \@@_use_state: { %<*trace> \trace:nnx { regex } { 2 } { state~\int_use:N \l_@@_current_state_int } % \tex_dimen:D \l_@@_current_state_int = \l_@@_step_int sp \scan_stop: \tex_the:D \tex_toks:D \l_@@_current_state_int \tex_dimen:D \l_@@_current_state_int = \__int_eval:w \l_@@_step_int + \c_one \__int_eval_end: sp \scan_stop: } % \end{macrocode} % \end{macro} % % \begin{macro}[int]{\@@_use_state_and_submatches:nn} % This function is called as one item in the array of active threads % after that array has been unpacked for a new step. Update the % \texttt{current_state} and \texttt{current_submatches} and use the % state if it has not yet been encountered at this step. % \begin{macrocode} \cs_new_protected:Npn \@@_use_state_and_submatches:nn #1 #2 { \int_set:Nn \l_@@_current_state_int {#1} \if_int_compare:w \tex_dimen:D \l_@@_current_state_int < \l_@@_step_int \tl_set:Nn \l_@@_current_submatches_prop {#2} \exp_after:wN \@@_use_state: \fi: \scan_stop: } % \end{macrocode} % \end{macro} % % \subsubsection{Actions when matching} % % \begin{macro}[int]{\@@_action_start_wildcard:} % For an unanchored match, state $0$ has a free transition to the next % and a costly one to itself, to repeat at the next position. To catch % repeated identical empty matches, we need to know if a successful % thread corresponds to an empty match. The instruction resetting % \cs{l_@@_fresh_thread_bool} may be skipped by a successful % thread, hence we had to add it to \cs{@@_match_loop:} too. % \begin{macrocode} \cs_new_protected_nopar:Npn \@@_action_start_wildcard: { \bool_set_true:N \l_@@_fresh_thread_bool \@@_action_free:n {1} \bool_set_false:N \l_@@_fresh_thread_bool \@@_action_cost:n {0} } % \end{macrocode} % \end{macro} % % \begin{macro}[int]{\@@_action_free:n, \@@_action_free_group:n} % \begin{macro}[aux]{\@@_action_free_aux:nn} % These functions copy a thread after checking that the \textsc{nfa} % state has not already been used at this position. If not, store % submatches in the new state, and insert the instructions for that % state in the input stream. Then restore the old value of % \cs{l_@@_current_state_int} and of the current submatches. The % two types of free transitions differ by how they test that the state % has not been encountered yet: the \texttt{group} version is % stricter, and will not use a state if it was used earlier in the % current thread, hence forcefully breaking the loop, while the % \enquote{normal} version will revisit a state when within the thread % itself. % \begin{macrocode} \cs_new_protected_nopar:Npn \@@_action_free:n { \@@_action_free_aux:nn { > \l_@@_step_int \else: } } \cs_new_protected_nopar:Npn \@@_action_free_group:n { \@@_action_free_aux:nn { < \l_@@_step_int } } \cs_new_protected:Npn \@@_action_free_aux:nn #1#2 { \use:x { \int_add:Nn \l_@@_current_state_int {#2} \exp_not:n { \if_int_compare:w \tex_dimen:D \l_@@_current_state_int #1 \exp_after:wN \@@_use_state: \fi: } \int_set:Nn \l_@@_current_state_int { \int_use:N \l_@@_current_state_int } \tl_set:Nn \exp_not:N \l_@@_current_submatches_prop { \exp_not:o \l_@@_current_submatches_prop } } } % \end{macrocode} % \end{macro} % \end{macro} % % \begin{macro}[int]{\@@_action_cost:n} % A transition which consumes the current character and shifts the % state by |#1|. The resulting state is stored in the \tn{skip} array % for use at the next position, and we also store the current % submatches. % \begin{macrocode} \cs_new_protected:Npn \@@_action_cost:n #1 { \exp_args:No \@@_store_state:n { \int_use:N \__int_eval:w \l_@@_current_state_int + #1 } } % \end{macrocode} % \end{macro} % % \begin{macro}[int]{\@@_store_state:n} % \begin{macro}[aux]{\@@_store_submatches:} % Put the given state in the array of \tn{skip} registers (converted % to a dimension in scaled points), and increment the length of the % array. Then store the current submatch in the This is done by % increasing the pointer \cs{l_@@_max_active_int}, and converting % the integer to a dimension (suitable for a \tn{skip} assignment) in % scaled points. % \begin{macrocode} \cs_new_protected:Npn \@@_store_state:n #1 { \@@_store_submatches: \tex_skip:D \l_@@_max_active_int = #1 sp \scan_stop: \int_incr:N \l_@@_max_active_int } \cs_new_protected_nopar:Npn \@@_store_submatches: { \tex_toks:D \l_@@_max_active_int \exp_after:wN { \l_@@_current_submatches_prop } } % \end{macrocode} % \end{macro} % \end{macro} % % \begin{macro}[int]{\@@_disable_submatches:} % Some user functions don't require tracking submatches. % We get a performance improvement by simply defining the % relevant functions to remove their argument and do nothing % with it. % \begin{macrocode} \cs_new_protected_nopar:Npn \@@_disable_submatches: { \cs_set_protected_nopar:Npn \@@_store_submatches: { } \cs_set_protected:Npn \@@_action_submatch:n ##1 { } } % \end{macrocode} % \end{macro} % % \begin{macro}[int]{\@@_action_submatch:n} % Update the current submatches with the information from the current % position. Maybe a bottleneck. % \begin{macrocode} \cs_new_protected:Npn \@@_action_submatch:n #1 { \prop_put:Nno \l_@@_current_submatches_prop {#1} { \int_use:N \l_@@_current_pos_int } } % \end{macrocode} % \end{macro} % % \begin{macro}[int]{\@@_action_success:} % There is a successful match when an execution path reaches the last % state in the \textsc{nfa}, unless this marks a second identical % empty match. Then mark that there was a successful match; it is % empty if it is \enquote{fresh}; and we store the current position % and submatches. The current step is then interrupted with % \cs{__prg_break:}, and only paths with higher precedence are % pursued further. The values stored here may be overwritten by a % later success of a path with higher precedence. % \begin{macrocode} \cs_new_protected_nopar:Npn \@@_action_success: { \@@_if_two_empty_matches:F { \bool_set_true:N \l_@@_match_success_bool \bool_set_eq:NN \l_@@_empty_success_bool \l_@@_fresh_thread_bool \int_set_eq:NN \l_@@_success_pos_int \l_@@_current_pos_int \prop_set_eq:NN \l_@@_success_submatches_prop \l_@@_current_submatches_prop \__prg_break: } } % \end{macrocode} % \end{macro} % % \subsection{Replacement} % % \subsubsection{Variables and helpers used in replacement} % % \begin{variable}{\l_@@_replacement_csnames_int} % The behaviour of closing braces inside a replacement text depends on % whether a sequences |\c{| or |\u{| has been encountered. The number % of \enquote{open} such sequences that should be closed by |}| is % stored in \cs{l_@@_replacement_csnames_int}, and decreased by % $1$ by each |}|. % \begin{macrocode} \int_new:N \l_@@_replacement_csnames_int % \end{macrocode} % \end{variable} % % \begin{variable}{\l_@@_balance_tl} % This token list holds the replacement text for % \cs{@@_replacement_balance_one_match:n} while it is being built % incrementally. % \begin{macrocode} \tl_new:N \l_@@_balance_tl % \end{macrocode} % \end{variable} % % \begin{macro}[aux, rEXP]{\@@_replacement_balance_one_match:n} % This expects as an argument the first index of a range of \tn{skip} % registers which hold the submatch information for a given match. It % can be used within an integer expression to obtain the brace balance % incurred by performing the replacement on that match. This combines % the braces lost by removing the match, braces added by all the % submatches appearing in the replacement, and braces appearing % explicitly in the replacement. Even though it is always redefined % before use, we initialize it as for an empty replacement. An % important property is that concatenating several calls to that % function must result in a valid integer expression (hence a leading % |+| in the actual definition). % \begin{macrocode} \cs_new:Npn \@@_replacement_balance_one_match:n #1 { - \@@_submatch_balance:n {#1} } % \end{macrocode} % \end{macro} % % \begin{macro}[aux, rEXP]{\@@_replacement_do_one_match:n} % The input is the same as \cs{@@_replacement_balance_one_match:n}. % This function is redefined to expand to the part of the token list % from the end of the previous match to a given match, followed by the % replacement text. Hence concatenating the result of this function % with all possible arguments (one call for each match), as well as % the range from the end of the last match to the end of the string, % will produce the fully replaced token list. The initialization does % not matter, but we set it as for an empty replacement. % \begin{macrocode} \cs_new:Npn \@@_replacement_do_one_match:n #1 { \@@_query_range:nn { \etex_glueshrink:D \tex_skip:D #1 } { \tex_skip:D #1 } } % \end{macrocode} % \end{macro} % % \begin{macro}[aux]{\@@_replacement_exp_not:N} % This function lets us navigate around the fact that the primitive % \cs{exp_not:n} requires a braced argument. As far as I can tell, it % is only needed if the user tries to include in the replacement text % a control sequence set equal to a macro parameter character, such as % \cs{c_parameter_token}. Indeed, within an \texttt{x}-expanding % assignment, \cs{exp_not:N}~|#| behaves as a single |#|, whereas % \cs{exp_not:n}~|{#}| behaves as a doubled |##|. % \begin{macrocode} \cs_new:Npn \@@_replacement_exp_not:N #1 { \exp_not:n {#1} } % \end{macrocode} % \end{macro} % % \subsubsection{Query and brace balance} % % \begin{macro}[int, rEXP]{\@@_query_range:nn} % \begin{macro}[aux, rEXP]{\@@_query_range_loop:ww} % When it is time to extract submatches from the token list, the % various tokens are stored in \tn{toks} registers numbered from % \cs{l_@@_min_pos_int} inclusive to \cs{l_@@_max_pos_int} % exclusive. The function \cs{@@_query_range:nn} \Arg{min} % \Arg{max} unpacks registers from the position \meta{min} to the % position $\meta{max}-1$ included. Once this is expanded, a second % \texttt{x}-expansion will result in the actual tokens from the % query. That second expansion is only done by user functions at the % very end of their operation, after checking (and correcting) the % brace balance first. % \begin{macrocode} \cs_new:Npn \@@_query_range:nn #1#2 { \exp_after:wN \@@_query_range_loop:ww \int_use:N \__int_eval:w #1 \exp_after:wN ; \int_use:N \__int_eval:w #2 ; \__prg_break_point: } \cs_new:Npn \@@_query_range_loop:ww #1 ; #2 ; { \if_int_compare:w #1 < #2 \exp_stop_f: \else: \exp_after:wN \__prg_break: \fi: \tex_the:D \tex_toks:D #1 \exp_stop_f: \exp_after:wN \@@_query_range_loop:ww \int_use:N \__int_eval:w #1 + \c_one ; #2 ; } % \end{macrocode} % \end{macro} % \end{macro} % % \begin{macro}[int]{\@@_query_submatch:n} % When this function is called, \tn{skip}$i$ holds the start and end % positions for the $i$-th overall submatch as its main and stretch % components. In the case of repeated matches, submatches from all the % matches are put one after the other in blocks of % \cs{l_@@_capturing_group_int} \tn{skip} registers. % \begin{macrocode} \cs_new:Npn \@@_query_submatch:n #1 { \@@_query_range:nn { \tex_skip:D \__int_eval:w #1 } { \etex_gluestretch:D \tex_skip:D \__int_eval:w #1 } } % \end{macrocode} % \end{macro} % % \begin{macro}[rEXP]{\@@_submatch_balance:n} % Every user function must result in a balanced token list (unbalanced % token lists cannot be stored by TeX). When we unpacked the query, we % kept track of the brace balance as the shrink component of % \tn{muskip} registers, hence the contribution from a given range is % the difference between the shrink components of % \tn{muskip}\meta{max~pos} and \tn{muskip}\meta{min~pos}. For the % $i$-th submatch, the end-points of the range are the main and % stretch components of \tn{skip}$i$. The trailing \cs{scan_stop:} is % gobbled by \cs{etex_muexpr:D}, and the whole expression can be cast % safely to an integer (no trailing expansion). % \begin{macrocode} \cs_new_protected:Npn \@@_submatch_balance:n #1 { \etex_glueshrink:D \etex_mutoglue:D \etex_muexpr:D \tex_muskip:D \etex_gluestretch:D \tex_skip:D #1 - \tex_muskip:D \tex_skip:D #1 \scan_stop: } % \end{macrocode} % \end{macro} % % \subsubsection{Framework} % % \begin{macro}[int]{\@@_replacement:n} % \begin{macro}[aux]{\@@_replacement_aux:n} % The replacement text is built incrementally by abusing \tn{toks} % within a group (see \pkg{l3tl-build}). We keep track in % \cs{l_@@_balance_int} of the balance of explicit begin- and % end-group tokens and \cs{l_@@_balance_tl} will consist of some % code to compute the brace balance from submatches (see its % description). Detect unescaped right braces, and escaped characters, % with trailing \cs{prg_do_nothing:} because some of the later % function look-ahead. Once the whole replacement text has been % parsed, make sure that there is no open csname. Finally, define the % \texttt{balance_one_match} and \texttt{do_one_match} functions. % \begin{macrocode} \cs_new_protected:Npn \@@_replacement:n #1 { % \trace_push:nnn { regex } { 1 } { @@_replacement:n } \__tl_build:Nw \l_@@_internal_a_tl \int_zero:N \l_@@_balance_int \tl_clear:N \l_@@_balance_tl \@@_escape_use:nnnn { \if_charcode:w \c_right_brace_str ##1 \@@_replacement_rbrace:N \else: \__tl_build_one:n \fi: ##1 } { \@@_replacement_escaped:N ##1 } { \__tl_build_one:n ##1 } {#1} \prg_do_nothing: \prg_do_nothing: \if_int_compare:w \l_@@_replacement_csnames_int > \c_zero \__msg_kernel_error:nnx { regex } { replacement-missing-rbrace } { \int_use:N \l_@@_replacement_csnames_int } \__tl_build_one:x { \prg_replicate:nn \l_@@_replacement_csnames_int \cs_end: } \fi: \cs_gset:Npx \@@_replacement_balance_one_match:n ##1 { + \int_use:N \l_@@_balance_int \l_@@_balance_tl - \@@_submatch_balance:n {##1} } \__tl_build_end: \exp_args:No \@@_replacement_aux:n \l_@@_internal_a_tl % \trace_pop:nnn { regex } { 1 } { @@_replacement:n } } \cs_new_protected:Npn \@@_replacement_aux:n #1 { \cs_set:Npn \@@_replacement_do_one_match:n ##1 { \@@_query_range:nn { \etex_glueshrink:D \tex_skip:D ##1 } { \tex_skip:D ##1 } #1 } } % \end{macrocode} % \end{macro} % \end{macro} % % \begin{macro}[aux]{\@@_replacement_escaped:N} % As in parsing a regular expression, we use an auxiliary built from % |#1| if defined. Otherwise, check for escaped digits (standing from % submatches from $0$ to $9$): anything else is a raw character. % We use \cs{token_to_str:N} to give spaces the right category code. % \begin{macrocode} \cs_new_protected:Npn \@@_replacement_escaped:N #1 { \cs_if_exist_use:cF { @@_replacement_#1:w } { \if_int_compare:w \c_one < 1#1 \exp_stop_f: \@@_replacement_put_submatch:n {#1} \else: \__tl_build_one:o { \token_to_str:N #1 } \fi: } } % \end{macrocode} % \end{macro} % % \subsubsection{Submatches} % % \begin{macro}[aux]{\@@_replacement_put_submatch:n} % Insert a submatch in the replacement text. This is dropped if the % submatch number is larger than the number of capturing groups. % Unless the submatch appears inside a |\c{...}| or |\u{...}| % construction, it must be taken into account in the brace balance. % Here, |##1| will receive a pointer to the $0$-th submatch for a % given match. We cannot use \cs{int_eval:n} because it is % expandable, and would be expanded too early (short of adding % \cs{exp_not:N}, making the code messy again). % \begin{macrocode} \cs_new_protected:Npn \@@_replacement_put_submatch:n #1 { \if_int_compare:w #1 < \l_@@_capturing_group_int \__tl_build_one:n { \@@_query_submatch:n { #1 + ##1 } } \if_int_compare:w \l_@@_replacement_csnames_int = \c_zero \tl_put_right:Nn \l_@@_balance_tl { + \@@_submatch_balance:n { \__int_eval:w #1+##1 \__int_eval_end: } } \fi: \fi: } % \end{macrocode} % \end{macro} % % \begin{macro}[aux]{\@@_replacement_g:w, \@@_replacement_g_digits:NN} % An ugly method to grab digits for the |\g| escape sequence. At the % end of the run of digits, check that it ends with a right brace. % \begin{macrocode} \cs_new_protected:Npn \@@_replacement_g:w #1#2 { \str_if_eq_x:nnTF { #1#2 } { \__tl_build_one:n \c_left_brace_str } { \int_zero:N \l_@@_internal_a_int \@@_replacement_g_digits:NN } { \@@_replacement_error:NNN g #1 #2 } } \cs_new_protected:Npn \@@_replacement_g_digits:NN #1#2 { \token_if_eq_meaning:NNTF #1 \__tl_build_one:n { \if_int_compare:w \c_one < 1#2 \exp_stop_f: \int_set:Nn \l_@@_internal_a_int { \c_ten * \l_@@_internal_a_int + #2 } \exp_after:wN \use_i:nnn \exp_after:wN \@@_replacement_g_digits:NN \else: \exp_after:wN \@@_replacement_error:NNN \exp_after:wN g \fi: } { \if_meaning:w \@@_replacement_rbrace:N #1 \exp_args:No \@@_replacement_put_submatch:n { \int_use:N \l_@@_internal_a_int } \exp_after:wN \use_none:nn \else: \exp_after:wN \@@_replacement_error:NNN \exp_after:wN g \fi: } #1 #2 } % \end{macrocode} % \end{macro} % % \subsubsection{Csnames in replacement} % % \begin{macro}[aux]{\@@_replacement_c:w} % \begin{macro}[aux]+\@@_replacement_c_{:w+ % |\c| can be followed by a left brace, or by a letter for which we % have defined a way to produce that category of characters. The % appropriate definitions for catcodes are introduced later. For % control sequences, if we are within a control sequence, convert % the token list to a string, otherwise simply prevent expansion, % with a weird cross-over between \cs{exp_not:n} and \cs{exp_not:N} % (see this helper's description for an explanation). % \begin{macrocode} \cs_new_protected:Npn \@@_replacement_c:w #1#2 { \token_if_eq_meaning:NNTF #1 \__tl_build_one:n { \cs_if_exist_use:cF { @@_replacement_c_#2:w } { \@@_replacement_error:NNN c #1#2 } } { \@@_replacement_error:NNN c #1#2 } } \cs_new_protected_nopar:cpn { @@_replacement_c_ \c_left_brace_str :w } { \if_case:w \l_@@_replacement_csnames_int \__tl_build_one:n { \exp_not:n { \exp_after:wN \@@_replacement_exp_not:N \cs:w } } \else: \__tl_build_one:n { \exp_not:n { \exp_after:wN \tl_to_str:N \cs:w } } \fi: \int_incr:N \l_@@_replacement_csnames_int } % \end{macrocode} % \end{macro} % \end{macro} % % \begin{macro}[aux]{\@@_replacement_u:w} % Check that |\u| is followed by a left brace. If so, start a control % sequence with \cs{cs:w}, which is then unpacked either with % \cs{exp_not:V} or \cs{tl_to_str:V} depending on the current context. % \begin{macrocode} \cs_new_protected:Npn \@@_replacement_u:w #1#2 { \str_if_eq_x:nnTF { #1#2 } { \__tl_build_one:n \c_left_brace_str } { \if_case:w \l_@@_replacement_csnames_int \__tl_build_one:n { \exp_not:n { \exp_after:wN \exp_not:V \cs:w } } \else: \__tl_build_one:n { \exp_not:n { \exp_after:wN \tl_to_str:V \cs:w } } \fi: \int_incr:N \l_@@_replacement_csnames_int } { \@@_replacement_error:NNN u #1#2 } } % \end{macrocode} % \end{macro} % % \begin{macro}[aux]{\@@_replacement_rbrace:N} % Within a |\c{...}| or |\u{...}| construction, end the control % sequence, and decrease the brace count. Otherwise, this is a raw % right brace. % \begin{macrocode} \cs_new_protected:Npn \@@_replacement_rbrace:N #1 { \if_int_compare:w \l_@@_replacement_csnames_int > \c_zero \__tl_build_one:n \cs_end: \int_decr:N \l_@@_replacement_csnames_int \else: \__tl_build_one:n #1 \fi: } % \end{macrocode} % \end{macro} % % \subsubsection{Characters in replacement} % % We will need to change the category code of the null character many % times, hence work in a group. The catcode-specific macros below are % defined in alphabetical order; if you are trying to understand the % code, start from the end of the alphabet as those categories are % simpler than active or begin-group. % \begin{macrocode} \group_begin: % \end{macrocode} % % \begin{macro}[aux]{\@@_replacement_char:nNN} % The only way to produce an arbitrary character--catcode pair is to % use the \tn{lowercase} or \tn{uppercase} primitives. This is a % wrapper for our purposes. The first argument is the null character % with various catcodes. The second and third arguments are grabbed % from the input stream: |#3| is the character whose character code to % reproduce. % \begin{macrocode} \cs_new_protected:Npn \@@_replacement_char:nNN #1#2#3 { \if_meaning:w \prg_do_nothing: #3 \__msg_kernel_error:nn { regex } { replacement-catcode-end } \else: \tex_lccode:D \c_zero = `#3 \scan_stop: \tl_to_lowercase:n { \__tl_build_one:n {#1} } \fi: } % \end{macrocode} % \end{macro} % % \begin{macro}[aux]{\@@_replacement_c_A:w} % For an active character, expansion must be avoided, twice because we % later do two \texttt{x}-expansions, to unpack \tn{toks} for the % query, and to expand their contents to tokens of the query. % \begin{macrocode} \char_set_catcode_active:N \^^@ \cs_new_protected_nopar:Npn \@@_replacement_c_A:w { \@@_replacement_char:nNN { \exp_not:n { \exp_not:N ^^@ } } } % \end{macrocode} % \end{macro} % % \begin{macro}[aux]{\@@_replacement_c_B:w} % An explicit begin-group token increases the balance, unless within a % |\c{...}| or |\u{...}| construction. Add the desired begin-group % character, using the standard \cs{if_false:} trick. We eventually % \texttt{x}-expand twice. The first time must yield a balanced token % list, and the second one gives the bare begin-group token. The % \cs{exp_after:wN} is not strictly needed, but is more consistent % with \pkg{l3tl-analysis}. % \begin{macrocode} \char_set_catcode_group_begin:N \^^@ \cs_new_protected_nopar:Npn \@@_replacement_c_B:w { \if_int_compare:w \l_@@_replacement_csnames_int = \c_zero \int_incr:N \l_@@_balance_int \fi: \@@_replacement_char:nNN { \exp_not:n { \exp_after:wN ^^@ \if_false: } \fi: } } } % \end{macrocode} % \end{macro} % % \begin{macro}[aux]{\@@_replacement_c_C:w} % This is not quite catcode-related: when the user requests a % character with category \enquote{control sequence}, the % one-character control symbol is returned. As for the active % character, we prepare for two \texttt{x}-expansions. % \begin{macrocode} \cs_new_protected:Npn \@@_replacement_c_C:w #1#2 { \__tl_build_one:n { \exp_not:N \exp_not:N \exp_not:c {#2} } } % \end{macrocode} % \end{macro} % % \begin{macro}[aux]{\@@_replacement_c_D:w} % Subscripts fit the mould: \tn{lowercase} the null byte with the % correct category. % \begin{macrocode} \char_set_catcode_math_subscript:N \^^@ \cs_new_protected_nopar:Npn \@@_replacement_c_D:w { \@@_replacement_char:nNN { ^^@ } } % \end{macrocode} % \end{macro} % % \begin{macro}[aux]{\@@_replacement_c_E:w} % Similar to the begin-group case, the second \texttt{x}-expansion % produces the bare end-group token. % \begin{macrocode} \char_set_catcode_group_end:N \^^@ \cs_new_protected_nopar:Npn \@@_replacement_c_E:w { \if_int_compare:w \l_@@_replacement_csnames_int = \c_zero \int_decr:N \l_@@_balance_int \fi: \@@_replacement_char:nNN { \exp_not:n { \if_false: { \fi: ^^@ } } } % \end{macrocode} % \end{macro} % % \begin{macro}[aux]{\@@_replacement_c_L:w} % Simply \tn{lowercase} a letter null byte to produce an arbitrary letter. % \begin{macrocode} \char_set_catcode_letter:N \^^@ \cs_new_protected_nopar:Npn \@@_replacement_c_L:w { \@@_replacement_char:nNN { ^^@ } } % \end{macrocode} % \end{macro} % % \begin{macro}[aux]{\@@_replacement_c_M:w} % No surprise here, we lowercase the null math toggle. % \begin{macrocode} \char_set_catcode_math_toggle:N \^^@ \cs_new_protected_nopar:Npn \@@_replacement_c_M:w { \@@_replacement_char:nNN { ^^@ } } % \end{macrocode} % \end{macro} % % \begin{macro}[aux]{\@@_replacement_c_O:w} % Lowercase an other null byte. % \begin{macrocode} \char_set_catcode_other:N \^^@ \cs_new_protected_nopar:Npn \@@_replacement_c_O:w { \@@_replacement_char:nNN { ^^@ } } % \end{macrocode} % \end{macro} % % \begin{macro}[aux]{\@@_replacement_c_P:w} % For macro parameters, expansion is a tricky issue. We need to % prepare for two \texttt{x}-expansions and passing through various % macro definitions. Note that we cannot replace one \cs{exp_not:n} by % doubling the macro parameter characters because this would misbehave % if a mischievous user asks for |\c{\cP\#}|, since that macro % parameter character would be doubled. % \begin{macrocode} \char_set_catcode_parameter:N \^^@ \cs_new_protected_nopar:Npn \@@_replacement_c_P:w { \@@_replacement_char:nNN { \exp_not:n { \exp_not:n { ^^@^^@^^@^^@ } } } } % \end{macrocode} % \end{macro} % % \begin{macro}[aux]{\@@_replacement_c_S:w} % Spaces are normalized on input by \TeX{} to have character code % $32$. It is in fact impossible to get a token with character code % $0$ and category code $10$. Hence we use $32$ instead of $0$ as our % base character. % \begin{macrocode} \cs_new_protected:Npn \@@_replacement_c_S:w #1#2 { \if_meaning:w \prg_do_nothing: #2 \__msg_kernel_error:nn { regex } { replacement-catcode-end } \else: \if_int_compare:w `#2 = \c_zero \__msg_kernel_error:nn { regex } { replacement-null-space } \fi: \tex_lccode:D 32 = `#2 \scan_stop: \tl_to_lowercase:n { \__tl_build_one:n {~} } \fi: } % \end{macrocode} % \end{macro} % % \begin{macro}[aux]{\@@_replacement_c_T:w} % No surprise for alignment tabs here. Those are surrounded by the % appropriate braces whenever necessary, hence they don't cause % trouble in alignment settings. % \begin{macrocode} \char_set_catcode_alignment:N \^^@ \cs_new_protected_nopar:Npn \@@_replacement_c_T:w { \@@_replacement_char:nNN { ^^@ } } % \end{macrocode} % \end{macro} % % \begin{macro}[aux]{\@@_replacement_c_U:w} % Simple call to \cs{@@_replacement_char:nNN} which lowercases the % math superscript |^^@|. % \begin{macrocode} \char_set_catcode_math_superscript:N \^^@ \cs_new_protected_nopar:Npn \@@_replacement_c_U:w { \@@_replacement_char:nNN { ^^@ } } % \end{macrocode} % \end{macro} % % Restore the catcode of the null byte. % \begin{macrocode} \group_end: % \end{macrocode} % % \subsubsection{An error} % % \begin{macro}[aux]{\@@_replacement_error:NNN} % Simple error reporting by calling one of the messages % \texttt{replacement-c}, \texttt{replacement-g}, or % \texttt{replacement-u}. % \begin{macrocode} \cs_new_protected:Npn \@@_replacement_error:NNN #1#2#3 { \__msg_kernel_error:nnx { regex } { replacement-#1 } {#3} #2 #3 } % \end{macrocode} % \end{macro} % % \subsection{User functions} % % \begin{macro}{\regex_new:N} % Before being assigned a sensible value, a regex variable matches % nothing. % \begin{macrocode} \cs_new_protected:Npn \regex_new:N #1 { \cs_new_eq:NN #1 \c_@@_no_match_regex } % \end{macrocode} % \end{macro} % % \begin{macro}{\regex_set:Nn, \regex_gset:Nn, \regex_const:Nn} % Compile, then store the result in the user variable with the % appropriate assignment function. % \begin{macrocode} \cs_new_protected_nopar:Npn \regex_set:Nn #1#2 { \@@_compile:n {#2} \tl_set_eq:NN #1 \l_@@_internal_regex } \cs_new_protected_nopar:Npn \regex_gset:Nn #1#2 { \@@_compile:n {#2} \tl_gset_eq:NN #1 \l_@@_internal_regex } \cs_new_protected_nopar:Npn \regex_const:Nn #1#2 { \@@_compile:n {#2} \tl_const:Nx #1 { \exp_not:o \l_@@_internal_regex } } % \end{macrocode} % \end{macro} % % \begin{macro}{\regex_show:N, \regex_show:n} % User functions: the \texttt{n} variant requires compilation first. % Then show the variable with some appropriate text. The auxiliary % \cs{@@_show:Nx} is defined in a different section. % \begin{macrocode} \cs_new_protected:Npn \regex_show:n #1 { \@@_compile:n {#1} \@@_show:Nx \l_@@_internal_regex { { \tl_to_str:n {#1} } } } \cs_new_protected:Npn \regex_show:N #1 { \@@_show:Nx #1 { variable~\token_to_str:N #1 } } % \end{macrocode} % \end{macro} % % \begin{macro}[TF]{\regex_match:nn, \regex_match:Nn} % Those conditionals are based on a common auxiliary defined % later. Its first argument builds the \textsc{nfa} corresponding to % the regex, and the second argument is the query token list. Once we % have performed the match, convert the resulting boolean to % \cs{prg_return_true:} or \texttt{false}. % \begin{macrocode} \prg_new_protected_conditional:Npnn \regex_match:nn #1#2 { T , F , TF } { \@@_if_match:nn { \@@_build:n {#1} } {#2} \@@_return: } \prg_new_protected_conditional:Npnn \regex_match:Nn #1#2 { T , F , TF } { \@@_if_match:nn { \@@_build:N #1 } {#2} \@@_return: } % \end{macrocode} % \end{macro} % % \begin{macro}{\regex_count:nnN, \regex_count:NnN} % Again, use an auxiliary whose first argument builds the \textsc{nfa}. % \begin{macrocode} \cs_new_protected:Npn \regex_count:nnN #1 { \@@_count:nnN { \@@_build:n {#1} } } \cs_new_protected:Npn \regex_count:NnN #1 { \@@_count:nnN { \@@_build:N #1 } } % \end{macrocode} % \end{macro} % % \begin{macro} % { % \regex_extract_once:nnN, \regex_extract_once:NnN, % \regex_extract_all:nnN, \regex_extract_all:NnN, % \regex_replace_once:nnN, \regex_replace_once:NnN, % \regex_replace_all:nnN, \regex_replace_all:NnN, % \regex_split:nnN, \regex_split:NnN % } % \begin{macro}[TF] % { % \regex_extract_once:nnN, \regex_extract_once:NnN, % \regex_extract_all:nnN, \regex_extract_all:NnN, % \regex_replace_once:nnN, \regex_replace_once:NnN, % \regex_replace_all:nnN, \regex_replace_all:NnN, % \regex_split:nnN, \regex_split:NnN % } % We define here $40$ user functions, following a common pattern in % terms of \texttt{:nnN} auxiliaries, defined in the coming % subsections. The auxiliary is handed \cs{@@_build:n} or % \cs{@@_build:N} with the appropriate regex argument, then all % other necessary arguments (replacement text, token list, \emph{etc.} % The conditionals call \cs{@@_return:} to return either % \texttt{true} or \texttt{false} once matching has been performed. % \begin{macrocode} \cs_set_protected:Npn \@@_tmp:w #1#2#3 { \cs_new_protected:Npn #2 ##1 { #1 { \@@_build:n {##1} } } \cs_new_protected:Npn #3 ##1 { #1 { \@@_build:N ##1 } } \prg_new_protected_conditional:Npnn #2 ##1##2##3 { T , F , TF } { #1 { \@@_build:n {##1} } {##2} ##3 \@@_return: } \prg_new_protected_conditional:Npnn #3 ##1##2##3 { T , F , TF } { #1 { \@@_build:N ##1 } {##2} ##3 \@@_return: } } \@@_tmp:w \@@_extract_once:nnN \regex_extract_once:nnN \regex_extract_once:NnN \@@_tmp:w \@@_extract_all:nnN \regex_extract_all:nnN \regex_extract_all:NnN \@@_tmp:w \@@_replace_once:nnN \regex_replace_once:nnN \regex_replace_once:NnN \@@_tmp:w \@@_replace_all:nnN \regex_replace_all:nnN \regex_replace_all:NnN \@@_tmp:w \@@_split:nnN \regex_split:nnN \regex_split:NnN % \end{macrocode} % \end{macro} % \end{macro} % % \subsubsection{Variables and helpers for user functions} % % \begin{variable}{\l_@@_match_count_int} % The number of matches found so far is stored % in \cs{l_@@_match_count_int}. This is only used % in the \cs{regex_count:nnN} functions. % \begin{macrocode} \int_new:N \l_@@_match_count_int % \end{macrocode} % \end{variable} % % \begin{variable}{@@_begin, @@_end} % Those flags are raised to indicate extra begin-group % or end-group tokens when extracting submatches. % \begin{macrocode} \flag_new:n { @@_begin } \flag_new:n { @@_end } % \end{macrocode} % \end{variable} % % \begin{variable}{\l_@@_submatch_int, \l_@@_zeroth_submatch_int} % The end-points of each submatch are stored as main and stretch % components of \tn{skip}\meta{submatch}, where \meta{submatch} ranges % from \cs{l_@@_max_state_int} (inclusive) to % \cs{l_@@_submatch_int} (exclusive). Each successful match comes % with a $0$-th submatch (the full match), and one match for each % capturing group: submatches corresponding to the last successful % match are labelled starting at % \texttt{zeroth_submatch}. Additionally, the shrink component of this % $0$-th submatch is the position at which that match attempt started: % this is used for splitting and replacements. % \begin{macrocode} \int_new:N \l_@@_submatch_int \int_new:N \l_@@_zeroth_submatch_int % \end{macrocode} % \end{variable} % % \begin{macro}[aux]{\@@_return:} % This function triggers either \cs{prg_return_false:} or % \cs{prg_return_true:} as appropriate to whether a match was found or % not. It is used by all user conditionals. % \begin{macrocode} \cs_new_protected_nopar:Npn \@@_return: { \if_meaning:w \c_true_bool \g_@@_success_bool \prg_return_true: \else: \prg_return_false: \fi: } % \end{macrocode} % \end{macro} % % \subsubsection{Matching} % % \begin{macro}[aux]{\@@_if_match:nn} % We don't track submatches, and stop after a single match. Build the % \textsc{nfa} with |#1|, and perform the match on the query |#2|. % \begin{macrocode} \cs_new_protected:Npn \@@_if_match:nn #1#2 { \group_begin: \@@_disable_submatches: \@@_single_match: #1 \@@_match:n {#2} \group_end: } % \end{macrocode} % \end{macro} % % \begin{macro}[aux]{\@@_count:nnN} % Again, we don't care about submatches. Instead of aborting after the % first \enquote{longest match} is found, we search for multiple % matches, incrementing \cs{l_@@_match_count_int} every time to % record the number of matches. Build the \textsc{nfa} and match. At % the end, store the result in the user's variable. % \begin{macrocode} \cs_new_protected:Npn \@@_count:nnN #1#2#3 { \group_begin: \@@_disable_submatches: \int_zero:N \l_@@_match_count_int \@@_multi_match:n { \int_incr:N \l_@@_match_count_int } #1 \@@_match:n {#2} \exp_args:NNNo \group_end: \int_set:Nn #3 { \int_use:N \l_@@_match_count_int } } % \end{macrocode} % \end{macro} % % \subsubsection{Extracting submatches} % % \begin{macro}[aux]{\@@_extract_once:nnN, \@@_extract_all:nnN} % Match once or multiple times. After each match (or after the only % match), extract the submatches using \cs{@@_extract:}. At the % end, store the sequence containing all the submatches into the user % variable |#3| after closing the group. % \begin{macrocode} \cs_new_protected:Npn \@@_extract_once:nnN #1#2#3 { \group_begin: \@@_single_match: #1 \@@_match:n {#2} \@@_extract: \@@_group_end_extract_seq:N #3 } \cs_new_protected:Npn \@@_extract_all:nnN #1#2#3 { \group_begin: \@@_multi_match:n { \@@_extract: } #1 \@@_match:n {#2} \@@_group_end_extract_seq:N #3 } % \end{macrocode} % \end{macro} % % \begin{macro}[aux]{\@@_split:nnN} % Splitting at submatches is a bit more tricky. For each match, % extract all submatches, and replace the zeroth submatch by the part % of the query between the start of the match attempt and the start of % the zeroth submatch. This is inhibited if the delimiter matched an % empty token list at the start of this match attempt. After the last % match, store the last part of the token list, which ranges from the % start of the match attempt to the end of the query. This step is % inhibited if the last match was empty and at the very end: decrement % \cs{l_@@_submatch_int}, which controls which \tn{skip} registers % will be used. % \begin{macrocode} \cs_new_protected:Npn \@@_split:nnN #1#2#3 { \group_begin: \@@_multi_match:n { \if_int_compare:w \l_@@_start_pos_int < \l_@@_success_pos_int \@@_extract: \tex_skip:D \l_@@_zeroth_submatch_int = \l_@@_start_pos_int sp plus \tex_skip:D \l_@@_zeroth_submatch_int \scan_stop: \fi: } #1 \@@_match:n {#2} %\assert_int:n { \l_@@_current_pos_int = \l_@@_max_pos_int } \tex_skip:D \l_@@_submatch_int = \l_@@_start_pos_int sp plus \l_@@_max_pos_int sp \scan_stop: \int_incr:N \l_@@_submatch_int \if_meaning:w \c_true_bool \l_@@_empty_success_bool \if_int_compare:w \l_@@_start_pos_int = \l_@@_max_pos_int \int_decr:N \l_@@_submatch_int \fi: \fi: \@@_group_end_extract_seq:N #3 } % \end{macrocode} % \end{macro} % % \begin{macro}[aux]{\@@_group_end_extract_seq:N} % The end-points of submatches are stored as the main and stretch % components of \tn{skip} registers from \cs{l_@@_max_state_int} to % \cs{l_@@_submatch_int} (exclusive). Extract the relevant ranges % into \cs{l_@@_internal_a_tl}. We detect unbalanced results using % the two flags \texttt{@@_begin} and \texttt{@@_end}, raised % whenever we see too many begin-group or end-group tokens in a % submatch. We disable \cs{__seq_item:n} to prevent two % \texttt{x}-expansions. % \begin{macrocode} \cs_new_protected:Npn \@@_group_end_extract_seq:N #1 { \cs_set_eq:NN \__seq_item:n \scan_stop: \flag_clear:n { @@_begin } \flag_clear:n { @@_end } \tl_set:Nx \l_@@_internal_a_tl { \s__seq \int_step_function:nnnN { \c_two * \l_@@_max_state_int } \c_one { \l_@@_submatch_int - \c_one } \@@_extract_seq_aux:n } \int_compare:nNnF { \flag_height:n { @@_begin } + \flag_height:n { @@_end } } = \c_zero { \__msg_kernel_error:nnxxx { regex } { result-unbalanced } { splitting~or~extracting~submatches } { \flag_height:n { @@_end } } { \flag_height:n { @@_begin } } } \use:x { \group_end: \tl_set:Nn \exp_not:N #1 { \l_@@_internal_a_tl } } } % \end{macrocode} % \end{macro} % % \begin{macro}[aux, EXP]{\@@_extract_seq_aux:n, \@@_extract_seq_aux:ww} % The \texttt{:n} auxiliary builds one item of the sequence of % submatches. First compute the brace balance of the submatch, then % extract the submatch from the query, adding the appropriate braces % and raising a flag if the submatch is not balanced. % \begin{macrocode} \cs_new:Npn \@@_extract_seq_aux:n #1 { \__seq_item:n { \exp_after:wN \@@_extract_seq_aux:ww \__int_value:w \@@_submatch_balance:n {#1} ; #1; } } \cs_new:Npn \@@_extract_seq_aux:ww #1; #2; { \if_int_compare:w #1 < \c_zero \flag_raise:n { @@_end } \prg_replicate:nn {-#1} { \exp_not:n { { \if_false: } \fi: } } \fi: \@@_query_submatch:n {#2} \if_int_compare:w #1 > \c_zero \flag_raise:n { @@_begin } \prg_replicate:nn {#1} { \exp_not:n { \if_false: { \fi: } } } \fi: } % \end{macrocode} % \end{macro} % % \begin{macro}[aux] % {\@@_extract:, \@@_extract_b:wn, \@@_extract_e:wn} % Our task here is to extract from the property list % \cs{l_@@_success_submatches_prop} the list of end-points of % submatches, and store them in \tn{skip} registers, from % \cs{l_@@_zeroth_submatch_int} upwards. We begin by emptying those % \tn{skip} registers. Then for each \meta{key}--\meta{value} pair in % the property list update the appropriate \tn{skip} component. This % is somewhat a hack: the \meta{key} is a non-negative integer % followed by |<| or |>|, which we use in a comparison to $-1$. At the % end, store the information about the position at which the match % attempt started, as a shrink component. % \begin{macrocode} \cs_new_protected_nopar:Npn \@@_extract: { \if_meaning:w \c_true_bool \g_@@_success_bool \int_set_eq:NN \l_@@_zeroth_submatch_int \l_@@_submatch_int \prg_replicate:nn \l_@@_capturing_group_int { \tex_skip:D \l_@@_submatch_int \c_zero sp \scan_stop: \int_incr:N \l_@@_submatch_int } \prop_map_inline:Nn \l_@@_success_submatches_prop { \if_int_compare:w ##1 \c_minus_one \exp_after:wN \@@_extract_e:wn \__int_value:w \else: \exp_after:wN \@@_extract_b:wn \__int_value:w \fi: \__int_eval:w \l_@@_zeroth_submatch_int + ##1 {##2} } \tex_skip:D \l_@@_zeroth_submatch_int = \tex_the:D \tex_skip:D \l_@@_zeroth_submatch_int minus \l_@@_start_pos_int sp \scan_stop: \fi: } \cs_new_protected:Npn \@@_extract_b:wn #1 < #2 { \tex_skip:D #1 = #2 sp plus \etex_gluestretch:D \tex_skip:D #1 \scan_stop: } \cs_new_protected:Npn \@@_extract_e:wn #1 > #2 { \tex_skip:D #1 = 1 \tex_skip:D #1 plus #2 sp \scan_stop: } % \end{macrocode} % \end{macro} % % \subsubsection{Replacement} % % \begin{macro}[aux]{\@@_replace_once:nnN} % Build the \textsc{nfa} and the replacement functions, then find a % single match. If the match failed, simply exit the % group. Otherwise, we do the replacement. Extract submatches. Compute % the brace balance corresponding to replacing this match by the % replacement (this depends on submatches). Prepare the replaced token % list: the replacement function produces the tokens from the start of % the query to the start of the match and the replacement text for % this match; we need to add the tokens from the end of the match to % the end of the query. Finally, store the result in the user's % variable after closing the group: this step involves an additional % \texttt{x}-expansion, and checks that braces are balanced in the % final result. % \begin{macrocode} \cs_new_protected:Npn \@@_replace_once:nnN #1#2#3 { \group_begin: \@@_single_match: #1 \@@_replacement:n {#2} \exp_args:No \@@_match:n { #3 } \if_meaning:w \c_false_bool \g_@@_success_bool \group_end: \else: \@@_extract: \int_set:Nn \l_@@_balance_int { \@@_replacement_balance_one_match:n { \l_@@_zeroth_submatch_int } } \tl_set:Nx \l_@@_internal_a_tl { \@@_replacement_do_one_match:n { \l_@@_zeroth_submatch_int } \@@_query_range:nn { \etex_gluestretch:D \tex_skip:D \l_@@_zeroth_submatch_int } { \l_@@_max_pos_int } } \@@_group_end_replace:N #3 \fi: } % \end{macrocode} % \end{macro} % % \begin{macro}[aux]{\@@_replace_all:nnN} % Match multiple times, and for every match, extract submatches and % additionally store the position at which the match attempt started % (as the shrink component of a \tn{skip} register). The \tn{skip} % registers from \cs{l_@@_max_state_int} to % \cs{l_@@_submatch_int} hold information about submatches of every % match in order; each match corresponds to % \cs{l_@@_capturing_group_int} consecutive \tn{skip} registers. % Compute the brace balance corresponding to doing all the % replacements: this is the sum of brace balances for replacing each % match. Join together the replacement texts for each match (including % the part of the query before the match), and the end of the query. % \begin{macrocode} \cs_new_protected:Npn \@@_replace_all:nnN #1#2#3 { \group_begin: \@@_multi_match:n { \@@_extract: } #1 \@@_replacement:n {#2} \exp_args:No \@@_match:n {#3} \int_set:Nn \l_@@_balance_int { 0 \int_step_function:nnnN { \c_two * \l_@@_max_state_int } \l_@@_capturing_group_int { \l_@@_submatch_int - \c_one } \@@_replacement_balance_one_match:n } \tl_set:Nx \l_@@_internal_a_tl { \int_step_function:nnnN { \c_two * \l_@@_max_state_int } \l_@@_capturing_group_int { \l_@@_submatch_int - \c_one } \@@_replacement_do_one_match:n \@@_query_range:nn \l_@@_start_pos_int \l_@@_max_pos_int } \@@_group_end_replace:N #3 } % \end{macrocode} % \end{macro} % % \begin{macro}[aux]{\@@_group_end_replace:N} % If the brace balance is not $0$, raise an error. Then set the user's % variable |#1| to the \texttt{x}-expansion of % \cs{l_@@_internal_a_tl}, adding the appropriate braces to produce % a balanced result. And end the group. % \begin{macrocode} \cs_new_protected_nopar:Npn \@@_group_end_replace:N #1 { \if_int_compare:w \l_@@_balance_int = \c_zero \else: \__msg_kernel_error:nnxxx { regex } { result-unbalanced } { replacing } { \int_max:nn { - \l_@@_balance_int } { \c_zero } } { \int_max:nn { \l_@@_balance_int } { \c_zero } } \fi: \use:x { \group_end: \tl_set:Nn \exp_not:N #1 { \if_int_compare:w \l_@@_balance_int < \c_zero \prg_replicate:nn { - \l_@@_balance_int } { { \if_false: } \fi: } \fi: \l_@@_internal_a_tl \if_int_compare:w \l_@@_balance_int > \c_zero \prg_replicate:nn { \l_@@_balance_int } { \if_false: { \fi: } } \fi: } } } % \end{macrocode} % \end{macro} % % \subsubsection{Storing and showing compiled patterns} % % \subsection{Messages} % % Messages for the preparsing phase. % \begin{macrocode} \__msg_kernel_new:nnnn { regex } { trailing-backslash } { Trailing~escape~character~`\iow_char:N\\'. } { A~regular~expression~or~its~replacement~text~ends~with~ the~escape~character~`\iow_char:N\\'.~It~will~be~ignored. } \__msg_kernel_new:nnnn { regex } { x-missing-rbrace } { Missing~closing~brace~in~`\iow_char:N\\x'~hexadecimal~sequence. } { You~wrote~something~like~ `\iow_char:N\\x\{...#1'.~ The~closing~brace~is~missing. } \__msg_kernel_new:nnnn { regex } { x-overflow } { Character~code~`#1'~too~large~in~`\iow_char:N\\x'~hexadecimal~sequence. } { You~wrote~something~like~ `\iow_char:N\\x\{\int_to_hexadecimal:n{#1}\}'.~ The~character~code~#1~is~larger~than~ the~maximum~value~\int_use:N \c_max_char_int. } % \end{macrocode} % % Invalid quantifier. % \begin{macrocode} \__msg_kernel_new:nnnn { regex } { invalid-quantifier } { Braced~quantifier~`#1'~may~not~be~followed~by~`#2'. } { The~character~`#2'~is~invalid~in~the~braced~quantifier~`#1'.~ The~only~valid~quantifiers~are~`*',~`?',~`+',~`{}',~ `{,}'~and~`{,}',~optionally~followed~by~`?'. } % \end{macrocode} % % Messages for missing or extra closing brackets and parentheses, with % some fancy singular/plural handling for the case of parentheses. % \begin{macrocode} \__msg_kernel_new:nnnn { regex } { missing-rbrack } { Missing~right~bracket~inserted~in~regular~expression. } { LaTeX~was~given~a~regular~expression~where~a~character~class~ was~started~with~`[',~but~the~matching~`]'~is~missing. } \__msg_kernel_new:nnnn { regex } { missing-rparen } { Missing~right~ \int_compare:nTF { #1 = 1 } { parenthesis } { parentheses } ~ inserted~in~regular~expression. } { LaTeX~was~given~a~regular~expression~with~\int_eval:n {#1} ~ more~left~parentheses~than~right~parentheses. } \__msg_kernel_new:nnnn { regex } { extra-rparen } { Extra~right~parenthesis~ignored~in~regular~expression. } { LaTeX~came~across~a~closing~parenthesis~when~no~submatch~group~ was~open.~The~parenthesis~will~be~ignored. } % \end{macrocode} % % Some escaped alphanumerics are not allowed everywhere. % \begin{macrocode} \__msg_kernel_new:nnnn { regex } { bad-escape } { Invalid~escape~`\iow_char:N\\#1'~ \@@_if_in_cs:TF { within~a~control~sequence. } { \@@_if_in_class:TF { in~a~character~class. } { following~a~category~test. } } } { The~escape~sequence~`\iow_char:N\\#1'~may~not~appear~ \@@_if_in_cs:TF { within~a~control~sequence~test~introduced~by~ `\iow_char:N\\c\iow_char:N\{'. } { \@@_if_in_class:TF { within~a~character~class~ } { following~a~category~test~such~as~`\iow_char:N\\cL'~ } because~it~does~not~match~exactly~one~character. } } % \end{macrocode} % % Range errors. % \begin{macrocode} \__msg_kernel_new:nnnn { regex } { range-missing-end } { Invalid~end-point~for~range~`#1-#2'~in~character~class. } { The~end-point~`#2'~of~the~range~`#1-#2'~may~not~serve~as~an~ end-point~for~a~range:~alphanumeric~characters~should~not~be~ escaped,~and~non-alphanumeric~characters~should~be~escaped. } \__msg_kernel_new:nnnn { regex } { range-backwards } { Range~`[#1-#2]'~out~of~order~in~character~class. } { In~ranges~of~characters~`[x-y]'~appearing~in~character~classes,~ the~first~character~code~must~not~be~larger~than~the~second.~ Here,~`#1'~has~character~code~\int_eval:n {`#1},~while~ `#2'~has~character~code~\int_eval:n {`#2}. } % \end{macrocode} % % Errors related to |\c| and |\u|. % \begin{macrocode} \__msg_kernel_new:nnnn { regex } { c-bad-mode } { Invalid~nested~`\iow_char:N\\c'~escape~in~regular~expression. } { The~`\iow_char:N\\c'~escape~cannot~be~used~within~ a~control~sequence~test~`\iow_char:N\\c{...}'.~ To~combine~several~category~tests,~use~`\iow_char:N\\c[...]'. } \__msg_kernel_new:nnnn { regex } { c-missing-rbrace } { Missing~right~brace~inserted~for~`\iow_char:N\\c'~escape. } { LaTeX~was~given~a~regular~expression~where~a~ `\iow_char:N\\c\iow_char:N\{...'~construction~was~not~ended~ with~a~closing~brace~`\iow_char:N\}'. } \__msg_kernel_new:nnnn { regex } { c-missing-rbrack } { Missing~right~bracket~inserted~for~`\iow_char:N\\c'~escape. } { A~construction~`\iow_char:N\\c[...'~appears~in~a~ regular~expression,~but~the~closing~`]'~is~not~present. } \__msg_kernel_new:nnnn { regex } { c-missing-category } { Invalid~character~`#1'~following~`\iow_char:N\\c'~escape. } { In~regular~expressions,~the~`\iow_char:N\\c'~escape~sequence~ may~only~be~followed~by~a~left~brace,~a~left~bracket,~or~a~ capital~letter~representing~a~character~category,~namely~ one~of~`ABCDELMOPSTU'. } \__msg_kernel_new:nnnn { regex } { c-trailing } { Trailing~category~code~escape~`\iow_char:N\\c'... } { A~regular~expression~ends~with~`\iow_char:N\\c'~followed~ by~a~letter.~It~will~be~ignored. } \__msg_kernel_new:nnnn { regex } { u-missing-lbrace } { Missing~left~brace~following~`\iow_char:N\\u'~escape. } { The~`\iow_char:N\\u'~escape~sequence~must~be~followed~by~ a~brace~group~with~the~name~of~the~variable~to~use. } \__msg_kernel_new:nnnn { regex } { u-missing-rbrace } { Missing~right~brace~inserted~for~`\iow_char:N\\u'~escape. } { LaTeX~ \str_if_eq_x:nnTF { } {#2} { reached~the~end~of~the~string~ } { encountered~an~escaped~alphanumeric~character `\iow_char:N\\#2'~ } when~parsing~the~argument~of~an~`\iow_char:N\\u\iow_char:N\{...\}'~escape. } % \end{macrocode} % % Errors when encountering the \textsc{posix} syntax |[:...:]|. % \begin{macrocode} \__msg_kernel_new:nnnn { regex } { posix-unsupported } { POSIX~collating~element~`[#1 ~ #1]'~not~supported. } { The~`[.foo.]'~and~`[=bar=]'~syntaxes~have~a~special~meaning~ in~POSIX~regular~expressions.~This~is~not~supported~by~LaTeX.~ Maybe~you~forgot~to~escape~a~left~bracket~in~a~character~class? } \__msg_kernel_new:nnnn { regex } { posix-unknown } { POSIX~class~`[:#1:]'~unknown. } { `[:#1:]'~is~not~among~the~known~POSIX~classes~ `[:alnum:]',~`[:alpha:]',~`[:ascii:]',~`[:blank:]',~ `[:cntrl:]',~`[:digit:]',~`[:graph:]',~`[:lower:]',~ `[:print:]',~`[:punct:]',~`[:space:]',~`[:upper:]',~ `[:word:]',~and~`[:xdigit:]'. } \__msg_kernel_new:nnnn { regex } { posix-missing-close } { Missing~closing~`:]'~for~POSIX~class. } { The~POSIX~syntax~`#1'~must~be~followed~by~`:]',~not~`#2'. } % \end{macrocode} % % In various cases, the result of a \pkg{l3regex} operation can leave us % with an unbalanced token list, which we must re-balance by adding % begin-group or end-group character tokens. % \begin{macrocode} \__msg_kernel_new:nnnn { regex } { result-unbalanced } { Missing~brace~inserted~when~#1. } { LaTeX~was~asked~to~do~some~regular~expression~operation,~ and~the~resulting~token~list~would~not~have~the~same~number~ of~begin-group~and~end-group~tokens.~Braces~were~inserted:~ #2~left,~#3~right. } % \end{macrocode} % % Error message for unknown options. % \begin{macrocode} \__msg_kernel_new:nnnn { regex } { unknown-option } { Unknown~option~`#1'~for~regular~expressions. } { The~only~available~option~is~`case-insensitive',~toggled~by~ `(?i)'~and~`(?-i)'. } \__msg_kernel_new:nnnn { regex } { special-group-unknown } { Unknown~special~group~`#1~...'~in~a~regular~expression. } { The~only~valid~constructions~starting~with~`(?'~are~ `(?:~...~)',~`(?|~...~)',~`(?i)',~and~`(?-i)'. } % \end{macrocode} % % Errors in the replacement text. % \begin{macrocode} \__msg_kernel_new:nnnn { regex } { replacement-c } { Misused~`\iow_char:N\\c'~command~in~a~replacement~text. } { In~a~replacement~text,~the~`\iow_char:N\\c'~escape~sequence~ can~be~followed~by~one~of~the~letters~`ABCDELMOPSTU'~ or~a~brace~group,~not~by~`#1'. } \__msg_kernel_new:nnnn { regex } { replacement-u } { Misused~`\iow_char:N\\u'~command~in~a~replacement~text. } { In~a~replacement~text,~the~`\iow_char:N\\u'~escape~sequence~ must~be~~followed~by~a~brace~group~holding~the~name~of~the~ variable~to~use. } \__msg_kernel_new:nnnn { regex } { replacement-g } { Missing~brace~for~the~`\iow_char:N\\g'~construction~ in~a~replacement~text. } { In~the~replacement~text~for~a~regular~expression~search,~ submatches~are~represented~either~as~`\iow_char:N \\g{dd..d}',~ or~`\\d',~where~`d'~are~single~digits.~Here,~a~brace~is~missing. } \__msg_kernel_new:nnnn { regex } { replacement-catcode-end } { Missing~character~for~the~`\iow_char:N\\c'~ construction~in~a~replacement~text. } { In~a~replacement~text,~the~`\iow_char:N\\c'~escape~sequence~ can~be~followed~by~one~of~the~letters~`ABCDELMOPSTU'~representing~ the~character~category.~Then,~a~character~must~follow.~LaTeX~ reached~the~end~of~the~replacement~when~looking~for~that. } \__msg_kernel_new:nnnn { regex } { replacement-null-space } { TeX~cannot~build~a~space~token~with~character~code~0. } { You~asked~for~a~character~token~with~category~space,~ and~character~code~0,~for~instance~through~ `\iow_char:N\\cS\iow_char:N\\x00'.~ This~specific~case~is~impossible~and~will~be~replaced~ by~a~normal~space. } \__msg_kernel_new:nnnn { regex } { replacement-missing-rbrace } { Missing~right~brace~inserted~in~replacement~text. } { There~ \int_compare:nTF { #1 = 1 } { was } { were } ~ #1~ missing~right~\int_compare:nTF { #1 = 1 } { brace } { braces } . } % \end{macrocode} % % \begin{macro}[aux]{\@@_msg_repeated:nnN} % This is not technically a message, but seems related enough to go % there. The arguments are: |#1| is the minimum number of repetitions; % |#2| is the number of allowed extra repetitions ($-1$ for infinite % number), and |#3| tells us about lazyness. % \begin{macrocode} \cs_new:Npn \@@_msg_repeated:nnN #1#2#3 { \str_if_eq_x:nnF { #1 #2 } { 1 0 } { , ~ repeated ~ \int_case:nnF {#2} { { -1 } { #1~or~more~times,~\bool_if:NTF #3 { lazy } { greedy } } { 0 } { #1~times } } { between~#1~and~\int_eval:n {#1+#2}~times,~ \bool_if:NTF #3 { lazy } { greedy } } } } % \end{macrocode} % \end{macro} % % \subsection{Code for tracing} % % The tracing code is still very experimental, and is meant to be used % with the \pkg{l3trace} package, currently in \texttt{l3trial}. % % \begin{macro}[int]{\@@_trace_states:n} % This function lists the contents of all states of the \textsc{nfa}, % stored in \tn{toks} from $0$ to \cs{l_@@_max_state_int} % (excluded). % \begin{macrocode} %<*trace> \cs_new_protected:Npn \@@_trace_states:n #1 { \int_step_inline:nnnn \l_@@_min_state_int \c_one { \l_@@_max_state_int - 1 } { \trace:nnx { regex } { #1 } { \iow_char:N \\toks ##1 = { \tex_the:D \tex_toks:D ##1 } } } } % % \end{macrocode} % \end{macro} % % \begin{macrocode} % % \end{macrocode} % % \end{implementation} % % \PrintIndex % \endinput %^^A NOT IMPLEMENTED %^^A \p{xx} a character with the xx property %^^A \P{xx} a character without the xx property %^^A [[:xxx:]] positive POSIX named set %^^A [[:^xxx:]] negative POSIX named set %^^A (?=...) positive look ahead %^^A (?!...) negative look ahead %^^A (?<=...) positive look behind %^^A (?...) or (?'name'...) or (?P...) %^^A named capturing group %^^A \R a newline sequence %^^A \X an extended Unicode sequence %^^A (?C) or (?Cn) callout with data n %^^A (?R) recurse whole pattern %^^A (?[+-]n) or \g<[+-]n> or (?&name) or (?P>name) or \g %^^A call subpattern %^^A (?([+-]n)... or (?()... %^^A reference condition %^^A (?(R)... or (?(Rn)... or (?(R&name)... %^^A recursion condition %^^A (?(DEFINE)... define subpattern for reference %^^A (?(assert)... assertion condition %^^A (*ACCEPT) force successful match %^^A (*FAIL) force backtrack; synonym (*F) %^^A (*COMMIT) overall failure, no advance of starting point %^^A (*PRUNE) advance to next starting character %^^A (*SKIP) advance start to current matching position %^^A (*THEN) local failure, backtrack to next alternation %^^A (*CR) or (*LF) or (*CRLF) or (*ANYCRLF) or (*ANY) %^^A newline convention %^^A (*BSR_ANYCRLF) or (*BSR_UNICODE) %^^A change what \R matches. %^^A %^^A \cx "control-x", where x is any ASCII character %^^A \C one byte, even in UTF-8 mode (best avoided) %^^A + possessive quantifiers %^^A (?>...) atomic, non-capturing group %^^A (?#....) comment (not nestable) %^^A (?JmsUx) options (duplicate names; multiline; single line; %^^A ungreedy; extended) %^^A (*NO_START_OPT) no start-match optimization (PCRE_NO_START_OPTIMIZE) %^^A (*UTF8) set UTF-8 mode (PCRE_UTF8) %^^A (*UCP) set PCRE_UCP (use Unicode properties for \d etc) %^^A \n or \gn or \g{[-]n} or \g{name} or (?P=name) %^^A or \k or \k'name' or \k{name} %^^A back-references