pcre.pl

View source with formatted comments or as raw
    1/*  Part of SWI-Prolog
    2
    3    Author:        Jan Wielemaker and Peter Ludemann
    4    E-mail:        jan@swi-prolog.org
    5    WWW:           http://www.swi-prolog.org
    6    Copyright (c)  2017-2023, VU University Amsterdam
    7                              SWI-Prolog Solutions b.v.
    8    All rights reserved.
    9
   10    Redistribution and use in source and binary forms, with or without
   11    modification, are permitted provided that the following conditions
   12    are met:
   13
   14    1. Redistributions of source code must retain the above copyright
   15       notice, this list of conditions and the following disclaimer.
   16
   17    2. Redistributions in binary form must reproduce the above copyright
   18       notice, this list of conditions and the following disclaimer in
   19       the documentation and/or other materials provided with the
   20       distribution.
   21
   22    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   23    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   24    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   25    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
   26    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   27    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   28    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   29    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
   30    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   31    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
   32    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   33    POSSIBILITY OF SUCH DAMAGE.
   34*/
   35
   36:- module(pcre,
   37          [ re_match/2,           % +Regex, +String
   38            re_match/3,           % +Regex, +String, +Options
   39            re_matchsub/3,        % +Regex, +String, -Subs
   40            re_matchsub/4,        % +Regex, +String, -Subs, +Options
   41            re_foldl/6,           % :Goal, +Regex, +String, ?V0, ?V, +Options
   42            re_split/3,           % +Pattern, +String, -Split:list
   43            re_split/4,           % +Pattern, +String, -Split:list, +Options
   44            re_replace/4,         % +Pattern, +With, +String, -NewString
   45            re_replace/5,         % +Pattern, +With, +String, -NewString, +Options
   46            re_compile/3,         % +Pattern, -Regex, +Options
   47            re_flush/0,
   48            re_config/1           % ?Config
   49          ]).   50:- autoload(library(apply), [maplist/2, maplist/3]).   51:- autoload(library(error), [must_be/2, existence_error/2]).   52:- autoload(library(dcg/basics), [eos/2, digit/3, digits/3]).   53:- autoload(library(lists), [append/3]).   54
   55:- use_foreign_library(foreign(pcre4pl)).   56
   57:- meta_predicate
   58    re_foldl(3, +, +, ?, ?, +).   59
   60/** <module> Perl compatible regular expression matching for SWI-Prolog
   61
   62This module provides an interface  to  the [PCRE2](http://www.pcre.org/)
   63(Perl Compatible Regular Expression)  library.   This  Prolog  interface
   64provides an almost complete wrapper around PCRE2  (the successor to
   65PCRE)  with as much backward compatibility to PCRE as possible,  because
   66the original implementation was for PCRE (also known as PCRE1).
   67
   68Regular  expressions  are  created  from   a  pattern  and  options  and
   69represented as  a SWI-Prolog _blob_.   This implies they are  subject to
   70(atom) garbage  collection. Compiled  regular expressions can  safely be
   71used  in multiple  threads. Most  predicates accept  both an  explicitly
   72compiled regular  expression, a pattern,  or a term  Pattern/Flags.  The
   73semantics of the pattern can be additionally modified by options. In the
   74latter two cases a regular expression  _blob_ is created and stored in a
   75cache. The cache can be cleared using re_flush/0.
   76
   77Most of the predicates in this library   take  both a regular expression
   78represented as a string  with  optional   flags,  e.g.,  `'aap'/i`  or a
   79_compiled regular_ expression. If a string (+flags) alternative is used,
   80the library maintains a cache of  compiled regular expressions. See also
   81re_flush/0. The library can  be  asked   to  rewrite  the re_match/2 and
   82re_match/3 goals to use  inlined   compiled  regular  expression objects
   83using
   84
   85    :- set_prolog_flag(re_compile, true).
   86
   87This has some consequences:
   88
   89  - Performance is considerable better.
   90  - Compiled regular expressions are currently incompatible with
   91    _Quick Load Files_ (`.qlf`, see qcompile/1) and _Saved States_
   92    (see qsave_program/2 and the ``-c`` command line option.
   93  - Debugging may be harder.
   94
   95@see `man pcre2api` or https://www.pcre.org/current/doc/html/pcre2api.html
   96     for details of the PCRE2 syntax and options.
   97*/
   98
   99:- predicate_options(re_match/3, 3,
  100                     [ start(integer), % Not part of pcre2 API
  101                       % These are in the same order as in pcre4pl.c, to make it easy to compare them
  102                       anchored(boolean),    % Also re_compile/3
  103                       utf_check(boolean),   % Also re_compile/3
  104                       endanchored(boolean), % Also re_compile/3
  105                       bol(boolean),
  106                       eol(boolean),
  107                       empty(boolean),
  108                       empty_atstart(boolean),
  109                       partial_soft(boolean),
  110                       partial_hard(boolean),
  111                       % dfa_restart(boolean),  % TODO: if pcre2_dfa_match() is supported
  112                       % dfa_shortest(boolean), % TODO: if pcre2_dfa_match() is supported
  113                       jit(boolean),
  114                       copy_matched_subject(boolean)
  115                     ]).  116:- predicate_options(re_compile/3, 3,
  117                     [ capture_type(oneof([atom,string,range])), % Not part of pcre2 API
  118                       % These are in the same order as in pcre4pl.c, to make it easy to compare them
  119                       anchored(boolean),    % Also re_match/3
  120                       utf_check(boolean),   % Also re_match/3
  121                       endanchored(boolean), % Also re_match/3
  122                       allow_empty_class(boolean),
  123                       alt_bsux(boolean),
  124                       auto_callout(boolean),
  125                       caseless(boolean),
  126                       dollar_endonly(boolean),
  127                       dotall(boolean),
  128                       dupnames(boolean),
  129                       extended(boolean),
  130                       firstline(boolean),
  131                       match_unset_backref(boolean),
  132                       multiline(boolean),
  133                       never_ucp(boolean),
  134                       never_utf(boolean),
  135                       auto_capture(boolean),
  136                       no_auto_capture(boolean), % backwards compatibility
  137                       auto_possess(boolean),
  138                       dotstar_anchor(boolean),
  139                       start_optimize(boolean),
  140                       ucp(boolean),
  141                       greedy(boolean),
  142                       ungreedy(boolean), % Backwards compatibility
  143                       utf(boolean),
  144                       never_backslash_c(boolean),
  145                       alt_circumflex(boolean),
  146                       alt_verbnames(boolean),
  147                       use_offset_limit(boolean),
  148                       extended_more(boolean),
  149                       literal(boolean),
  150                       match_invalid_utf(boolean),
  151                       jit_complete(boolean),
  152                       jit_partial_soft(boolean),
  153                       jit_partial_hard(boolean),
  154                       jit_invalid_utf(boolean),
  155                       bsr(oneof([anycrlf,unicode])),
  156                       bsr2(oneof([anycrlf,unicode])),
  157                       compat(oneof([])), % Obsolete
  158                       newline(oneof([any,anycrlf,cr,lf,crlf,nul])),
  159                       newline2(oneof([any,anycrlf,cr,lf,crlf,nul]))
  160                     ]).  161:- predicate_options(re_matchsub/4, 4,
  162                     [ pass_to(re_match/3, 3)
  163                     ]).  164:- predicate_options(re_foldl/6, 6,
  165                     [ pass_to(re_match/3, 3)
  166                     ]).  167:- predicate_options(re_split/4, 4,
  168                     [ pass_to(re_match/3, 3)
  169                     ]).  170:- predicate_options(re_replace/5, 5,
  171                     [ pass_to(re_match/3, 3)
  172                     ]).  173
  174%!  re_match(+Regex, +String) is semidet.
  175%!  re_match(+Regex, +String, +Options) is semidet.
  176%
  177%   Succeeds if String matches Regex.  For example:
  178%
  179%     ```
  180%     ?- re_match("^needle"/i, "Needle in a haystack").
  181%     true.
  182%     ```
  183%
  184%   Defined  Options  are  given  below.   For  details,  see  the  PCRE
  185%   documentation.  If  an option is  repeated, the first value  is used
  186%   and  subsequent  values  are   ignored.   Unrecognized  options  are
  187%   ignored.   Unless otherwise  specified, boolean  options default  to
  188%   `false`.
  189%
  190%   If Regex is a text pattern  (optionally with flags), then any of the
  191%   Options for  re_compile/3 can  be used, in  addition to  the Options
  192%   listed below. If Regex is the  result of re_compile/3, then only the
  193%   following execution-time  Options are recognized and  any others are
  194%   ignored. Some options may not exist on your system, depending on the
  195%   PCRE2 version and  how it was built - these  unsupported options are
  196%   silently ignored.
  197%
  198%     * start(From)
  199%     Start at the given character index
  200%     * anchored(Bool)
  201%     If `true`, match only at the first position
  202%     * bol(Bool)
  203%     String is the beginning of a line (default `true`) -
  204%       affects behavior of circumflex metacharacter (`^`).
  205%     * empty(Bool)
  206%     An empty string is a valid match (default `true`)
  207%     * empty_atstart(Bool)
  208%     An empty string at the start of the subject is a valid match
  209%     (default `true`)
  210%     * eol(Bool)
  211%     String is the end of a line -
  212%       affects behavior of dollar metacharacter (`$`)
  213%       (default `true`).
  214%     * newline(Mode)
  215%     If `any`, recognize any Unicode newline sequence,
  216%     if `anycrlf`, recognize CR, LF, and CRLF as newline
  217%     sequences, if `cr`, recognize CR, if `lf`, recognize
  218%     LF, if `crlf` recognize CRLF as newline.
  219%     The default is determined by how PCRE was built, and
  220%     can be found by re_config(newline2(NewlineDefault)).
  221%     * newline2(Mode) - synonym for newline(Mode).
  222%     * utf_check(Bool) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  223%     You should not need this because SWI-Prolog ensures that the UTF8 strings are valid,
  224%     so the default is `false`.
  225%     * endanchored(Bool) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  226%     * partial_soft(Bool) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  227%     * partial_hard(Bool) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  228%     * dfa_restart(Bool) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  229%     * dfa_shortest(Bool) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  230%
  231%   @arg  Regex is  the  output of  re_compile/3, a  pattern  or a  term
  232%   Pattern/Flags, where Pattern is an atom or string. The defined flags
  233%   and their related option for re_compile/3 are below.
  234%     - *x*: extended(true)
  235%     - *i*: caseless(true)
  236%     - *m*: multiline(true)
  237%     - *s*: dotall(true)
  238%     - *a*: capture_type(atom)
  239%     - *r*: capture_type(range)
  240%     - *t*: capture_type(term)
  241%
  242%   If Regex is the output  of re_compile/3, any compile-time options in
  243%   Options or Flags are ignored and only match-time options are used.
  244%
  245%   The options  that are  derived from flags  take precedence  over the
  246%   options in the  Options list. In the case of  conflicting flags, the
  247%   first one is used (e.g., `ra` results in `capture_type(range)`).
  248
  249re_match(Regex, String) :-
  250    re_match(Regex, String, []).
  251re_match(Regex, String, Options) :-
  252    re_compiled(Regex, Compiled, Options),
  253    re_match_(Compiled, String, Options).
  254
  255%!  re_matchsub(+Regex, +String, -Sub:dict) is semidet.
  256%!  re_matchsub(+Regex, +String, -Sub:dict, +Options) is semidet.
  257%
  258%   Match String  against Regex.  On  success, Sub is a  dict containing
  259%   integer keys  for the numbered capture  group and atom keys  for the
  260%   named capture groups. The entire match  string has the key `0`.  The
  261%   associated  value is  determined  by  the capture_type(Type)  option
  262%   passed  to  re_compile/3, or  by  flags  if  Regex  is of  the  form
  263%   Pattern/Flags;  and may  be  specified at  the  level of  individual
  264%   captures  using  a  naming  convention for  the  caption  name.  See
  265%   re_compile/3 for details.
  266%
  267%   The  example  below  exploits  the  typed groups  to  parse  a  date
  268%   specification:
  269%
  270%     ```
  271%     ?- re_matchsub("(?<date> (?<year_I>(?:\\d\\d)?\\d\\d) -
  272%                     (?<month_I>\\d\\d) - (?<day_I>\\d\\d) )"/x,
  273%                    "2017-04-20", Sub, []).
  274%     Sub = re_match{0:"2017-04-20", date:"2017-04-20",
  275%                    day:20, month:4, year:2017}.
  276%
  277%     ```
  278%
  279%   @arg Both compilation and execution options are processed.  See
  280%   re_compile/3 and re_match/3 for the set of options. In addition,
  281%   some compilation options may passed as ``/Flags`` to Regex - see
  282%   re_match/3 for the list of flags.
  283%
  284%   @arg Regex  See re_match/2 for a description of this argument.
  285
  286re_matchsub(Regex, String, Subs) :-
  287    re_matchsub(Regex, String, Subs, []).
  288
  289re_matchsub(Regex, String, Subs, Options) :-
  290    re_compiled(Regex, Compiled, Options),
  291    re_matchsub_(Compiled, String, Pairs, Options),
  292    dict_pairs(Subs, re_match, Pairs).
  293
  294%!  re_foldl(:Goal, +Regex, +String, ?V0, ?V, +Options) is semidet.
  295%
  296%   Fold all matches of Regex on String.  Each match is represented by a
  297%   dict as specified  for re_matchsub/4.  V0 and V are  related using a
  298%   sequence of invocations of Goal as illustrated below.
  299%
  300%       ```
  301%       call(Goal, Dict1, V0, V1),
  302%       call(Goal, Dict2, V1, V2),
  303%       ...
  304%       call(Goal, Dictn, Vn, V).
  305%       ```
  306%
  307%   This predicate is used to implement re_split/4 and re_replace/4. For
  308%   example, we  can count all matches  of a Regex on  String using this
  309%   code:
  310%
  311%     ```
  312%     re_match_count(Regex, String, Count) :-
  313%         re_foldl(increment, Regex, String, 0, Count, []).
  314%
  315%     increment(_Match, V0, V1) :-
  316%         V1 is V0+1.
  317%     ```
  318%
  319%   After which we can query
  320%
  321%     ```
  322%     ?- re_match_count("a", "aap", X).
  323%     X = 2.
  324%     ```
  325%
  326%  Here is an example Goal for extracting all the matches with their
  327%  offsets within the string:
  328%
  329%  ```
  330%  range_match(Dict, StringIndex-[MatchStart-Substring|List], StringIndex-List) :-
  331%      Dict.(StringIndex.index) = MatchStart-MatchLen,
  332%      sub_string(StringIndex.string, MatchStart, MatchLen, _, Substring).
  333%  ```
  334%  And can be used with this query (note the capture_type(range) option,
  335%  which is needed by `range_match/3`, and greedy(false) to invert the
  336%  meaning of `*?`):
  337%  ```
  338%  ?- String = "{START} Mary {END} had a {START} little lamb {END}",
  339%     re_foldl(range_match,
  340%              "{START} *?(?<piece>.*) *?{END}",
  341%              String, _{string:String,index:piece}-Matches, _-[],
  342%              [capture_type(range),greedy(false)]).
  343%  Matches = [8-"Mary", 33-"little lamb"].
  344%  ```
  345re_foldl(Goal, Regex, String, V0, V, Options) :-
  346    re_compiled(Regex, Compiled, Options),
  347    re_foldl_(Compiled, String, Goal, V0, V, Options).
  348
  349:- public re_call_folder/4. % prevent code obfusication name mangling
  350:- meta_predicate re_call_folder(2, +, ?, ?).  351
  352%   re_call_folder(:Goal, +Pairs, ?V0, ?V1).
  353%   Used by re_foldl_/6 to call Goal with a dict.
  354%     DO NOT use "%!" comment - that would add it to the docs
  355re_call_folder(Goal, Pairs, V0, V1) :-
  356    dict_pairs(Dict, re_match, Pairs),
  357    call(Goal, Dict, V0, V1).
  358
  359
  360%!  re_split(+Pattern, +String, -Splits:list) is det.
  361%!  re_split(+Pattern, +String, -Splits:list, +Options) is det.
  362%
  363%   Split String using the regular  expression Pattern. Splits is a list
  364%   of strings holding alternating matches  of Pattern and skipped parts
  365%   of the String, starting with a  skipped part.  The Splits lists ends
  366%   with a  string of  the content  of String after  the last  match. If
  367%   Pattern does not  appear in String, Splits is a  list holding a copy
  368%   of String. This implies the number of elements in Splits is _always_
  369%   odd.  For example:
  370%
  371%     ```
  372%     ?- re_split("a+", "abaac", Splits, []).
  373%     Splits = ["","a","b","aa","c"].
  374%     ?- re_split(":\\s*"/n, "Age: 33", Splits, []).
  375%     Splits = ['Age', ': ', 33].
  376%     ```
  377%
  378%   @arg  Pattern is  the pattern  text, optionally  follows by  /Flags.
  379%   Similar to re_matchsub/4, the final output type can be controlled by
  380%   a flag `a` (atom), `s` (string, default) or `n` (number if possible,
  381%   atom otherwise).
  382
  383re_split(Pattern, String, Splits) :-
  384    re_split(Pattern, String, Splits, []).
  385re_split(Pattern, String, Splits, Options) :-
  386    split_range_regex(Pattern, Compiled, Type, Options),
  387    State = state(String, 0, Type),
  388    re_foldl(split(State), Compiled, String, Splits, [Last], Options),
  389    arg(2, State, LastSkipStart),
  390    typed_sub(Type, String, LastSkipStart, _, 0, Last).
  391
  392split_range_regex(Pattern/Flags, Compiled, Type, Options) =>
  393    split_range_regex(Pattern, Flags, Compiled, Type, Options).
  394split_range_regex(Pattern, Compiled, Type, Options) =>
  395    split_range_regex(Pattern, '', Compiled, Type, Options).
  396
  397split_range_regex(Pattern, Flags, Compiled, Type, Options) =>
  398    regex_capture_type_flag_chars(Flags, Chars, Options),
  399    split_flags(Chars, Chars1, Type),
  400    atom_chars(RFlags, [r|Chars1]),
  401    re_flags_options(RFlags, ROptions),
  402    append(ROptions, Options, Options2),
  403    re_compiled(Pattern/RFlags, Compiled, Options2).
  404
  405split_flags([], [], Type) :-
  406    default(Type, string).
  407split_flags([H|T0], T, Type) :-
  408    split_type(H, Type),
  409    !,
  410    split_flags(T0, T, Type).
  411split_flags([H|T0], [H|T], Type) :-
  412    split_flags(T0, T, Type).
  413
  414split_type(a, atom).
  415split_type(s, string).
  416split_type(n, name).
  417
  418split(State, Dict, [Skipped,Sep|T], T) :-
  419    matched(State, Dict.0, Sep),
  420    skipped(State, Dict.0, Skipped).
  421
  422matched(state(String, _, Type), Start-Len, Matched) :-
  423    typed_sub(Type, String, Start, Len, _, Matched).
  424
  425skipped(State, Start-Len, Skipped) :-
  426    State = state(String, Here, Type),
  427    SkipLen is Start-Here,
  428    typed_sub(Type, String, Here, SkipLen, _, Skipped),
  429    NextSkipStart is Start+Len,
  430    nb_setarg(2, State, NextSkipStart).
  431
  432typed_sub(string, Haystack, B, L, A, String) :-
  433    sub_string(Haystack, B, L, A, String).
  434typed_sub(atom, Haystack, B, L, A, String) :-
  435    sub_atom(Haystack, B, L, A, String).
  436typed_sub(name, Haystack, B, L, A, Value) :-
  437    sub_string(Haystack, B, L, A, String),
  438    (   number_string(Number, String)
  439    ->  Value = Number
  440    ;   atom_string(Value, String)
  441    ).
  442
  443%!  re_replace(+Pattern, +With, +String, -NewString) is det.
  444%!  re_replace(+Pattern, +With, +String, -NewString, +Options) is det.
  445%
  446%   Replace matches  of the  regular expression  Pattern in  String with
  447%   With (possibly containing references to captured substrings).
  448%
  449%   Throws  an error  if With  uses  a name  that doesn't  exist in  the
  450%   Pattern.
  451%
  452%   @arg Pattern  is the  pattern text,  optionally followed  by /Flags.
  453%   Flags  may include  `g`, replacing  all occurences  of Pattern.   In
  454%   addition, similar  to re_matchsub/4,  the final  output type  can be
  455%   controlled  by a  flag `a`  (atom)  or `s`  (string, default).   The
  456%   output  type can  also be  specified by  the `capture_type`  option.
  457%   Capture  type  suffixes  can   modify  behavior;  for  example,  the
  458%   following  will  change an  ISO  8601  format date  (YYYY-MM-DD)  to
  459%   American style (m/d/y),  and also remove leading zeros  by using the
  460%   `_I` suffix:
  461%
  462%   ```
  463%   re_replace("(?<date> (?<year_I>(?:\\d\\d)?\\d\\d) -
  464%               (?<month_I>\\d\\d) - (?<day_I>\\d\\d) )"/x,
  465%              "$month-$day-$year",
  466%              ISODate, AmericanDate)`
  467%   ```
  468%
  469%   @arg  With  is  the  replacement text.  It  may  reference  captured
  470%   substrings using \N or $Name. Both N  and Name may be written as {N}
  471%   and {Name} to avoid ambiguities. If  a substring is named, it cannot
  472%   be referenced by its number. The single chracters `$` and `\` can be
  473%   escaped  by  doubling  (e.g.,  `re_replace(".","$$","abc",Replaced)`
  474%   results in  `Replaced="$bc"`). (Because  `\` is an  escape character
  475%   inside strings, you need to write "\\\\" to get a single backslash.)
  476%
  477%   @arg Options See re_match/3 for the set of options.
  478%
  479%   The options  that are  derived from flags  take precedence  over the
  480%   options in the  Options list. In the case of  conflicting flags, the
  481%   first one  is used  (e.g., `as` results  in `capture_type(string)`).
  482%   If  a  `capture_type` is  meaningless  (`range`  or `term`),  it  is
  483%   ignored.
  484
  485re_replace(Pattern, With, String, NewString) :-
  486    re_replace(Pattern, With, String, NewString, []).
  487
  488re_replace(Pattern, With, String, NewString, Options) :-
  489    replace_range_regex(Pattern, Compiled, All, Type, Options),
  490    compile_replacement(With, RCompiled),
  491    State = state(String, 0, Type),
  492    (   All == all
  493    ->  re_foldl(replace(State, RCompiled), Compiled, String, Parts, [Last], [])
  494    ;   (   re_matchsub(Compiled, String, Match, [])
  495        ->  replace(State, RCompiled, Match, Parts, [Last])
  496        ;   Repl = false
  497        )
  498    ),
  499    (   Repl == false
  500    ->  parts_to_output(Type, [String], NewString)
  501    ;   arg(2, State, LastSkipStart),
  502        sub_string(String, LastSkipStart, _, 0, Last),
  503        parts_to_output(Type, Parts, NewString)
  504    ).
  505
  506regex_capture_type_flag_chars(Flags, Chars, Options) :-
  507    atom_chars(Flags, Chars0),
  508    % For replace or split, the capture_type must be range, so if a
  509    % different result is desired, it is specified in the flags. The
  510    % following code converts an Options capture_type to a flag
  511    % character and appends it to the Flags.
  512    (   memberchk(capture_type(T), Options),
  513        type_flag(TFlag, T)
  514    ->  % No need to do delete(Options,capture_type(_),Options2)
  515        % because Flags take precedence and first occurence in Options
  516        % takes precedence.
  517        append(Chars0, [TFlag], Chars)
  518    ;   Chars = Chars0
  519    ).
  520
  521%! replace_range_regex(+Pattern, -Compiled, -All, -Type, +Options) is det.
  522replace_range_regex(Pattern/Flags, Compiled, All, Type, Options) =>
  523    replace_range_regex(Pattern, Flags, Compiled, All, Type, Options).
  524replace_range_regex(Pattern, Compiled, All, Type, Options) =>
  525    replace_range_regex(Pattern, '', Compiled, All, Type, Options).
  526
  527replace_range_regex(Pattern, Flags, Compiled, All, Type, Options) =>
  528    regex_capture_type_flag_chars(Flags, Chars, Options),
  529    replace_flags(Chars, Chars1, All, Type),
  530    atom_chars(RFlags, [r|Chars1]),
  531    re_flags_options(RFlags, ROptions),
  532    append(ROptions, Options, Options2),
  533    re_compiled(Pattern, Compiled, Options2).
  534
  535replace_flags([], [], All, Type) :-
  536    default(All, first),
  537    default(Type, string).
  538replace_flags([H|T0], T, All, Type) :-
  539    (   all_flag(H, All)
  540    ->  true
  541    ;   type_flag(H, Type)
  542    ),
  543    !,
  544    replace_flags(T0, T, All, Type).
  545replace_flags([H|T0], [H|T], All, Type) :-
  546    replace_flags(T0, T, All, Type).
  547
  548all_flag(g, all).
  549
  550type_flag(a, atom).
  551type_flag(s, string).
  552
  553%! default(?Val, +Default) is det.
  554%  If Val isn't instantiated, instantiate it to Default.
  555%  If Val is already instantiated, succeed.
  556%  Equivalent to:
  557%     default( Val,  Default), var(Val) => Val = Default.
  558%     default(_Val, _Default) => true.
  559default(Val, Val) :- !.
  560default(_, _).
  561
  562replace(State, With, Dict, [Skipped|Parts], T) :-
  563    State = state(String, _, _Type),
  564    copy_term(With, r(PartsR, Skel)),
  565    maplist(dict_pair_lookup(Dict), Skel),
  566    range_strings(PartsR, String, Parts, T),
  567    skipped(State, Dict.0, Skipped).
  568
  569% dict_pair_lookup(d{a:1}, a-K) results in K=1.
  570dict_pair_lookup(Dict, Key-Dict.Key).
  571
  572range_strings([], _, T, T).
  573range_strings([Start-Len|T0], String, [S|T1], T) :-
  574    !,
  575    sub_string(String, Start, Len, _, S),
  576    range_strings(T0, String, T1, T).
  577range_strings([S|T0], String, [S|T1], T) :-
  578    range_strings(T0, String, T1, T).
  579
  580parts_to_output(string, Parts, String) :-
  581    atomics_to_string(Parts, String).
  582parts_to_output(atom, Parts, String) :-
  583    atomic_list_concat(Parts, String).
  584
  585%!  compile_replacement(+With, -Compiled)
  586%
  587%   Compile the replacement specification  into a specification that can
  588%   be processed quickly. The compiled expressions are cached and may be
  589%   reclaimed using  re_flush/0 (which also removes  compiled Regex from
  590%   re_compile/3).
  591%
  592%   This "compilation" has nothing to  do with PCRE pattern compilation;
  593%   it's used by re_replace/5 to process the With argument.
  594
  595:- table compile_replacement/2 as shared.  596
  597compile_replacement(With, r(Parts, Extract)) :-
  598    string_codes(With, Codes),
  599    phrase(replacement_parts(Parts, Pairs), Codes),
  600    % Pairs is LookupKey-Slot pairs, where a LookupKey might be
  601    % duplicated (Slot is a shared variable within Parts).
  602    Extract = Pairs.
  603
  604replacement_parts(Parts, Extract) -->
  605    string_escape(HCodes),
  606    (   ("\\" ; "$"),
  607        capture_name(Name)
  608    ->  !,
  609        { add_part(HCodes, Parts, T0),
  610          T0 = [Repl|T1],
  611          Extract = [Name-Repl|Extract1]
  612        },
  613        replacement_parts(T1, Extract1)
  614    ;   eos
  615    ->  !,
  616        { add_part(HCodes, Parts, []),
  617          Extract = []
  618        }
  619    ).
  620
  621add_part([], Parts, Parts) :-
  622    !.
  623add_part(Codes, [H|T], T) :-
  624    string_codes(H, Codes).
  625
  626%! string_escape(-Codes)// is nondet.
  627% Similar to dcg_basics:string(Codes) but also escapes "$" and "/"
  628string_escape([]) -->
  629    [].
  630string_escape([0'$|T]) -->
  631    "$$", !,
  632    string_escape(T).
  633string_escape([0'\\|T]) -->
  634    "\\\\", !,
  635    string_escape(T).
  636string_escape([H|T]) -->
  637    [H],
  638    string_escape(T).
  639
  640capture_name(Name) -->
  641    "{",
  642    (   digit(D0)
  643    ->  digits(DL),
  644        "}",
  645        { number_codes(Name, [D0|DL]) }
  646    ;   letter(A0),
  647        alnums(AL),
  648        "}",
  649        { atom_codes(Name, [A0|AL]) }
  650    ).
  651capture_name(Name) -->
  652    digit(D0),
  653    !,
  654    digits(DL),
  655    { number_codes(Name, [D0|DL]) }.
  656capture_name(Name) -->
  657    letter(A0),
  658    !,
  659    alnums(AL),
  660    { atom_codes(Name, [A0|AL]) }.
  661
  662letter(L) -->
  663    [L],
  664    { between(0'a,0'z,L)
  665    ; between(0'A,0'Z,L)
  666    ; L == 0'_
  667    }, !.
  668
  669alnums([H|T]) -->
  670    alnum(H),
  671    !,
  672    alnums(T).
  673alnums([]) -->
  674    "".
  675
  676alnum(L) -->
  677    [L],
  678    { between(0'a,0'z,L)
  679    ; between(0'A,0'Z,L)
  680    ; between(0'0,0'9,L)
  681    ; L == 0'_
  682    }, !.
  683
  684%!  re_compile(+Pattern, -Regex, +Options) is det.
  685%
  686%   Compiles Pattern  to a  Regex _blob_ of  type `regex`  (see blob/2).
  687%   Defined  Options are  given below.   Please consult  the [PCRE2  API
  688%   documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  689%   for details.  If an option is  repeated, the first value is used and
  690%   subsequent values  are ignored.   Unrecognized options  are ignored.
  691%   Unless otherwise specified, boolean options default to `false`. Some
  692%   options may not exist on your system, depending on the PCRE2 version
  693%   and  how it  was  built  - these  unsupported  options are  silently
  694%   ignored.
  695%
  696%   The various matching predicates can take  either a Regex _blob_ or a
  697%   string  pattern; if  they  are  given a  string  pattern, they  call
  698%   re_compile/3 and cache the result; so, there is little reason to use
  699%   re_compile/3 directly.
  700%
  701%     * anchored(Bool)
  702%     If `true`, match only at the first position
  703%     * auto_capture(Bool)
  704%     Enable use of numbered capturing parentheses.
  705%     (default `true`)
  706%     * bsr(Mode)
  707%     If `anycrlf`, \R only matches CR, LF or CRLF;  if `unicode`,
  708%     \R matches all Unicode line endings.
  709%     * bsr2(Mode) - synonym for bsr(Mode).
  710%     * caseless(Bool)
  711%     If `true`, do caseless matching.
  712%     * compat(With)
  713%     Error   -   PCRE1   had  =|compat(javascript)|=   for   JavaScript
  714%     compatibility, but PCRE2 has removed that.
  715%     * dollar_endonly(Bool)
  716%     If `true`, $ not to match newline at end
  717%     * dotall(Bool)
  718%     If `true`, . matches anything including NL
  719%     * dupnames(Bool)
  720%     If `true`, allow duplicate names for subpatterns
  721%     * extended(Bool)
  722%     If `true`, ignore white space and # comments
  723%     * firstline(Bool)
  724%     If `true`, force matching to be before newline
  725%     * greedy(Bool)
  726%     If  `true`,  operators such  as  `+`  and  `*` are  greedy  unless
  727%     followed by `?`; if `false`, the  operators are not greedy and `?`
  728%     has the opposite meaning. It can also beset by a `(?U)` within the
  729%     pattern  -   see  the  [PCRE2  pattern   internal  option  setting
  730%     documentation](https://www.pcre.org/current/doc/html/pcre2pattern.html#SEC13)
  731%     for details and note that the PCRE2 option is `UNGREEDY`, which is
  732%     the inverse of this packages `greedy` options.  (default `true`)
  733%     * compat(With)
  734%     Raises an  errr - PCRE1 had  =|compat(javascript)|= for JavaScript
  735%     compatibility, but PCRE2 has removed that option . Consider using
  736%     the `alt_bsux` and `extra_alt_bsux` options.
  737%     * multiline(Bool)
  738%     If `true`, ^ and $ match newlines within data
  739%     * newline(Mode)
  740%     If  `any`, recognize  any Unicode  newline sequence;  if `anycrlf`
  741%     (default), recognize  CR, LF,  and CRLF  as newline  sequences; if
  742%     `cr`, recognize CR;  if `lf`, recognize LF;  `crlf` recognize CRLF
  743%     as  newline; if  `nul`,  recognize the  NULL  character (0x00)  as
  744%     newline.
  745%     * newline2(Mode) - synonym for newline(Mode).
  746%     * ucp(Bool)
  747%     If `true`, use Unicode properties for \d, \w, etc.
  748%     * utf_check(Bool) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  749%     You should not need this because SWI-Prolog ensures that the UTF8 strings are valid,
  750%     * endanchored(boolean) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  751%     * allow_empty_class(boolean) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  752%     * alt_bsux(boolean) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  753%     * auto_callout(boolean) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  754%     * match_unset_backref(boolean) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  755%     * never_ucp(boolean) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  756%     * never_utf(boolean) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  757%     * auto_possess(boolean) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  758%     (default `true`)
  759%     * dotstar_anchor(boolean) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  760%     (default `true`)
  761%     * start_optimize(boolean) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  762%     (default `true`)
  763%     * utf(boolean) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  764%     * never_backslash_c(boolean) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  765%     * alt_circumflex(boolean) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  766%     * alt_verbnames(boolean) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  767%     * use_offset_limit(boolean) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  768%     * extended_more(boolean) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  769%     * literal(boolean) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  770%     * match_invalid_utf(boolean) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  771%     * jit_complete(boolean) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  772%     * jit_partial_soft(boolean) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  773%     * jit_partial_hard(boolean) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  774%     * jit_invalid_utf(boolean) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  775%     * jit(boolean) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  776%     (default `true`)
  777%     * copy_matched_subject(boolean) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  778%
  779%   In addition to the options above that directly map to PCRE flags the
  780%   following options are processed:
  781%
  782%     * optimise(Bool) or optimize(Bool)
  783%     Turns on the JIT compiler for additional optimization that greatly
  784%     that speeds  up the matching  performance of many  patterns. (Note
  785%     that he meaning has changed slightly from the PCRE1 implementation
  786%     - PCRE2  always optimises  where possible;  this is  an additional
  787%     optimisation.)
  788%     * capture_type(+Type)
  789%     How to return the matched part  of the input and possibly captured
  790%     groups in there.  Possible values are:
  791%       - string
  792%       Return the captured string as a string (default).
  793%       - atom
  794%       Return the captured string as an atom.
  795%       - range
  796%       Return the captured string as a pair `Start-Length`.  Note that
  797%       we use `Start-Length` rather than the more conventional
  798%       `Start-End` to allow for immediate use with sub_atom/5 and
  799%       sub_string/5.
  800%       - term
  801%       Parse the  captured string  as a Prolog  term.  This  is notably
  802%       practical if you capture a number.
  803%
  804%    The  `capture_type` specifies  the default  for this  pattern.  The
  805%    interface supports  a different type  for each _named_  group using
  806%    the syntax  `(?<name_T>...)`, where `T`  is one of  ``S`` (string),
  807%    ``A`` (atom), ``I`` (integer), ``F`` (float), ``N`` (number), ``T``
  808%    (term)  and ``R``  (range).  In the  current implementation  ``I``,
  809%    ``F`` and  ``N`` are synonyms  for ``T``.  Future versions  may act
  810%    different if the parsed value is not of the requested numeric type.
  811%
  812%    Note that re_compile/3 does not support the Pattern/Flags form that
  813%    is supported by re_match/3, re_replace/4, etc.; the Pattern must be
  814%    text and all compile options specified in Options.
  815
  816%!  re_compiled(+Spec, --Regex, +Options) is det.
  817%
  818%   Create  a  compiled regex  from  a  specification.  Cached  compiled
  819%   regular expressions  can be  reclaimed using re_flush/0  (which also
  820%   removes   "compiled"   With    arguments   from   re_replace/4   and
  821%   re_replace/5).
  822
  823:- table re_compiled_/4 as shared.  824
  825re_compiled(RegexIn, Regex, Options) :-
  826    (   blob(RegexIn, regex)
  827    ->  Regex = RegexIn
  828    ;   RegexIn = Text/Flags
  829    ->  re_compiled_(Text, Flags, Regex, Options)
  830    ;   re_compiled_(RegexIn, '', Regex, Options)
  831    ).
  832
  833re_compiled_(Text, Flags, Regex, Options) =>
  834    must_be(text, Text),
  835    must_be(atom, Flags),
  836    re_flags_options(Flags, Options0),
  837    append(Options0, Options, Options2),
  838    re_compile(Text, Regex, Options2).
  839
  840re_flags_options(Flags, Options) :-
  841    atom_chars(Flags, Chars),
  842    maplist(re_flag_option, Chars, Options).
  843
  844re_flag_option(Flag, Option) :-
  845    re_flag_option_(Flag, Option),
  846    !.
  847re_flag_option(Flag, _) :-
  848    existence_error(re_flag, Flag).
  849
  850re_flag_option_(i, caseless(true)).
  851re_flag_option_(m, multiline(true)).
  852re_flag_option_(x, extended(true)).
  853re_flag_option_(s, dotall(true)).
  854re_flag_option_(a, capture_type(atom)).
  855re_flag_option_(r, capture_type(range)).
  856re_flag_option_(t, capture_type(term)).
  857
  858%!  re_flush
  859%
  860%   Clean pattern and replacement caches.
  861%
  862%   @tbd Flush automatically if the cache becomes too large.
  863
  864re_flush :-
  865    abolish_module_tables(pcre).
  866
  867%!  re_config(?Term)
  868%
  869%   Extract configuration information from the  pcre library. Term is of
  870%   the   form    ``Name(Value)``.    Name    is   derived    from   the
  871%   ``PCRE_CONFIG_*``  constant  after   removing  ``PCRE_CONFIG_``  and
  872%   mapping the name to  lower case, e.g.  `utf8`, `unicode_properties`,
  873%   etc.  Value is a Prolog boolean, integer, or atom. For boolean (1 or
  874%   0) values, `true` or `false` is returned.
  875%
  876%   re_config/1 will  backtrack through  all the  possible configuration
  877%   values  if its  argument  is a  variable. If  an  unknown option  is
  878%   specified, re_config/1 fails.
  879%
  880%   Non-compatible  changes  between  PCRE1 and  PCRE2  because  numeric
  881%   values changed: `bsr` and `newline` have been replaced by `bsr2` and
  882%   `newline2`:
  883%     * `bsr2` - previously `bsr` returned 0 or 1; now returns `unicode`
  884%       or `anycrlf`
  885%     * `newline2`  -  previously  `newline` returned  an  integer,  now
  886%       returns `cr`, `lf`, `crlf`, `any`, `anycrlf`, `nul`
  887%
  888%  Term values are as follows. Some values might not exist, depending on
  889%  the version of PCRE2 and the options it was built with.
  890%
  891%   * bsr2
  892%     The character  sequences that the `\R` escape sequence  matches by
  893%     default. Replaces `bsr` option from PCRE1, which is not compatible.
  894%   * compiled_widths
  895%     An integer whose  lower bits indicate which code  unit widths were
  896%     selected when PCRE2 was built.  The 1-bit indicates 8-bit support,
  897%     and  the  2-bit and  4-bit  indicate  16-bit and  32-bit  support,
  898%     respectively. The 1  bit should always be set  because the wrapper
  899%     code requires 8 bit support.
  900%   * depthlimit
  901%   * heaplimit
  902%   * jit
  903%     `true` if just-in-time compiling is available.
  904%   * jittarget
  905%     A string containing the name of the architecture for which the JIT
  906%     compiler is configured. e.g., 'x86 64bit (little endian + unaligned)'.
  907%   * linksize
  908%   * matchlimit
  909%   * never_backslash_c
  910%   * newline2
  911%     An atom whose value specifies  the default character sequence that
  912%     is  recognized as  meaning "newline"  (`cr`, `lf`,  `crlf`, `any`,
  913%     `anycrlf`, `nul`).  Replaces `newline` option from PCRE1, which is
  914%     not compatible.
  915%   * parenslimit
  916%   * stackrecurse
  917%   * unicode
  918%     Always `true`
  919%   * unicode_version
  920%     The unicode version as an atom, e.g. '12.1.0'.
  921%   * utf8 - synonym for `unicode`
  922%   * parens_limit
  923%   * version
  924%   The  version information  as an  atom, containing  the PCRE  version
  925%   number and release date, e.g. '10.34 2019-11-21'.
  926%
  927%   For backwards compatibility with  PCRE1, the following are accepted,
  928%   but are deprecated:
  929%     * `utf8` - synonym for `unicode`
  930%     * `link_size` - synonym for `linksize`
  931%     * `match_limit` - synonym for `matchlimit`
  932%     * `parens_limit` - synonym for `parenslimit`
  933%     * `unicode_properties` - always true
  934%   The following  have been removed  because they don't exist  in PCRE2
  935%   and don't seem to have any meaningful use in PCRE1:
  936%     * `posix_malloc_threshold`
  937%     * `match_limit_recursion`
  938
  939%   @see `man pcre2api` for details
  940
  941re_config(Term), var(Term) =>
  942    re_config_choice(Term),
  943    % This code depends on re_config_/1 failing if it's given an invalid
  944    % Term (e.g., re_config_(jittarget(_)) fails if jit(false)). If
  945    % re_config_/1 is changed to throw an error, then the following call
  946    % needs to be inside catch/3.
  947    re_config_(Term).
  948re_config(Term) =>
  949    re_config_(Term).
  950
  951		 /*******************************
  952		 *            COMPILE		*
  953		 *******************************/
  954
  955:- create_prolog_flag(re_compile, false, [keep(true)]).  956
  957expand_regex(Regex, String, Options, Goal) :-
  958    \+ blob(Regex, regex),
  959    re_compiled(Regex, Compiled, Options),
  960    Goal = (pcre:re_match_(Compiled, String, Options)).
  961
  962
  963:- multifile
  964    user:goal_expansion/2.  965
  966user:goal_expansion(re_match(Regex, String), Compiled) :-
  967    ground(Regex),
  968    current_prolog_flag(re_compile, true),
  969    expand_regex(Regex, String, [], Compiled).
  970user:goal_expansion(re_match(Regex, String, Options), Compiled) :-
  971    ground(Regex),
  972    ground(Options),
  973    current_prolog_flag(re_compile, true),
  974    expand_regex(Regex, String, Options, Compiled)