View source with formatted comments or as raw
    1/*  Part of SWISH
    2
    3    Author:        Jan Wielemaker
    4    E-mail:        jan@swi-prolog.org
    5    WWW:           http://www.swi-prolog.org
    6    Copyright (C): 2024, SWI-Prolog Solutions b.v.
    7    All rights reserved.
    8
    9    Redistribution and use in source and binary forms, with or without
   10    modification, are permitted provided that the following conditions
   11    are met:
   12
   13    1. Redistributions of source code must retain the above copyright
   14       notice, this list of conditions and the following disclaimer.
   15
   16    2. Redistributions in binary form must reproduce the above copyright
   17       notice, this list of conditions and the following disclaimer in
   18       the documentation and/or other materials provided with the
   19       distribution.
   20
   21    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   22    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   23    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   24    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
   25    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   26    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   27    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   28    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
   29    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   30    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
   31    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   32    POSSIBILITY OF SUCH DAMAGE.
   33*/
   34
   35:- module(es_swish,
   36          [ es_create_index/0,
   37            es_add_file/1,                        % +File
   38            es_add/2,                             % +Offset, +Limit
   39            es_add_since/1,                       % +Time
   40            es_query/2
   41          ]).   42:- use_module(elastic).   43:- use_module('../storage').   44:- use_module('../authenticate').   45:- use_module(library(base64)).   46:- use_module(library(broadcast)).   47:- use_module(library(solution_sequences)).   48:- use_module(library(apply)).   49:- use_module(library(pprint)).   50:- use_module(library(debug)).   51:- use_module(library(lists)).   52:- use_module(library(http/http_server)).   53
   54/** <module> SWISH integration of Elastic Search
   55
   56This module hooks into storage.pl  to   provide  full  text and metadata
   57search over the file storage. This  module   listens  to  saved files to
   58index new or updated files.
   59*/
   60
   61%!  es_create_index
   62%
   63%   Create an Elastic Search index to represent commits.
   64
   65es_create_index :-
   66    commit_properties(Properties),
   67    es_create_index(swish, Properties).
   68
   69%!  es_add_since(+Time)
   70%
   71%   Update entries that have  been  added   or  modified  less than Time
   72%   seconds ago.
   73
   74es_add_since(Time) :-
   75    get_time(Now),
   76    Since is Now - Time,
   77    forall(call_nth((storage_meta_data(File, Meta),
   78                     Meta.time > Since),
   79                    Nth),
   80           es_add_file(File, 0, Nth)).
   81
   82%!  es_add(+Offset, +Limit) is det.
   83%
   84%   Add files to the index iven an offset and limit.
   85%
   86%   @tbd: see whether the bulk insert option of Elastic search provides
   87%   a significant speedup. See
   88%   https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-bulk.html)
   89
   90es_add(Offset, Limit) :-
   91    forall(call_nth(limit(Limit, offset(Offset, storage_file(File))), Nth),
   92           es_add_file(File, Offset, Nth)).
   93
   94es_add_file(File, Offset, N) :-
   95    Nth is Offset+N,
   96    format(user_error, '\r~D ~w ... \e[K', [Nth, File]),
   97    Error = error(_,_),
   98    catch(es_add_file(File), Error,
   99          print_message(warning, Error)).
  100
  101%!  es_add_file(+File) is det.
  102%
  103%   Add a file to the `swish` index.
  104%
  105%   @error May throw Gitty, Redis or Elastic errors.  Bulk adding
  106%   should typically ignore these.
  107
  108es_add_file(File) :-
  109    storage_file(File, Data, Meta),
  110    !,
  111    file_name_extension(_Base, Ext, File),
  112    atom_concat('p/', File, IdPath),
  113    base64url(IdPath, Id0),
  114    str_limit(Id0, 511, Id), % Elastic _id is limited to 512 bytes
  115    es_add(swish, Meta.put(_{ content: Data,
  116                              type:Ext
  117                            }), _Reply,
  118           [ id(Id)
  119           ]).
  120es_add_file(File) :-
  121    print_message(warning, gitty(no_file(File))).
  122
  123str_limit(Id, Limit, Id) :-
  124    string_length(Id, Len),
  125    Len =< Limit,
  126    !.
  127str_limit(Id0, Limit, Id) :-
  128    sub_string(Id0, 0, Limit, _, Id).
  129
  130%!  es_query(+Query, -Result) is det.
  131%
  132%   Run a query against the Elastic database.
  133
  134es_query(For, Result) :-
  135    es_search(swish, For, Result).
  136
  137commit_properties(#{ name: #{type: keyword},      % File name
  138
  139                     author: #{type: keyword},    % Ownership
  140                     avatar: #{type: keyword},
  141                     commit: #{type: keyword},
  142                     identity: #{type: keyword},
  143                     email: #{type: keyword},
  144                     profile_id: #{type: keyword},
  145                     peer: #{type: ip, ignore_malformed:true},
  146
  147                     title: #{type: text},        % Search metadata
  148                     description: #{type: text},
  149                     commit_message: #{type: text},
  150                     type: #{type: keyword},
  151                     tags: #{type: keyword},
  152                     example: #{type: boolean},
  153
  154                     data: #{type: keyword},      % Content
  155                     previous: #{type: keyword},
  156                     content: #{type: text},
  157                     time: #{ type: date,
  158                              format:"epoch_second"
  159                            },
  160                                                  % Access permissions
  161                     modify: #{type: keyword},    % list of any, login or owner
  162                     public: #{type: boolean}
  163                   }).
  164
  165
  166		 /*******************************
  167		 *        SEARCH PLUGIN		*
  168		 *******************************/
  169
  170:- multifile
  171    web_storage:search_sources_hook/2.  172
  173%!  web_storage:search_sources_hook(+Query, -Results) is det.
  174%
  175%   Run full text and/or metadata search over the stored files.
  176
  177web_storage:search_sources_hook(Query, Result) :-
  178    (   catch_with_backtrace(es_search_sources_hook(Query, Result), E, true)
  179    ->  (   var(E)
  180        ->  true
  181        ;   message_to_string(E, Msg),
  182            Result = #{ error: Msg },
  183            (   debugging(elastic)
  184            ->  print_term(Result, [nl(true)])
  185            ;   true
  186            )
  187        )
  188    ;   Result = #{ error: "failed" }
  189    ).
  190
  191es_search_sources_hook(Query, Result) :-
  192    (   debugging(elastic)
  193    ->  print_term(Query, [nl(true)])
  194    ;   true
  195    ),
  196    partition(on_content, Query.query, OnContent, OnMeta),
  197    phrase(es_filter(Query, OnMeta), Filter),
  198    phrase(es_content(OnContent), Match),
  199    es_ordering(OnContent, OnMeta, Query, Field, Dir),
  200    dict_pairs(OrderBy, #, [Field - #{order:Dir}]),
  201    ESQuery = #{query:
  202                #{bool:
  203                  #{ must: Match,
  204                     filter: Filter
  205                   }
  206                 },
  207                '_source': false,        % do not include source
  208                fields:
  209                [ name,                  % select fields
  210                  time,
  211                  author,
  212                  avatar,
  213                  tags
  214                ],
  215                track_total_hits: true,  % default
  216                highlight:
  217                #{ fields:
  218                   #{ content: #{}}
  219                 },
  220                sort: [
  221                    OrderBy
  222                ],
  223                from: Query.get(offset, 0),
  224                size: Query.get(limit, 10)
  225               },
  226    (   debugging(elastic)
  227    ->  print_term(ESQuery, [nl(true)])
  228    ;   true
  229    ),
  230    es_query(ESQuery, Matches),
  231    es_to_swish(Matches, Result).
  232
  233on_content(word(_)).
  234on_content(string(_)).
  235on_content(regex(_,_)).
  236
  237%!  es_content(+Query)//
  238%
  239%   Get the content filtering from Query
  240
  241es_content([]) -->
  242    [].
  243es_content([H|T]) -->
  244    es_content_1(H),
  245    es_content(T).
  246
  247es_content_1(word(W)) -->
  248    [ #{match: #{content: W}} ].
  249es_content_1(string(W)) -->
  250    [ #{match_phrase: #{content: W}} ].
  251es_content_1(regex(RE,Flags)) -->
  252    field_filter(regex(RE,Flags), content).
  253
  254%!  es_filter(+Query, OnMeta)//
  255%
  256%   Obtain the meta-data filtering from the query.
  257
  258es_filter(Query, OnMeta) -->
  259    es_owner_filter(Query, OnMeta),
  260    es_tag_filter(OnMeta),
  261    es_type_filter(OnMeta),
  262    es_name_filter(OnMeta).
  263
  264es_owner_filter(Query, OnMeta) -->		% private search
  265    { memberchk(user("me"), OnMeta),
  266      user_property(Query.auth, identity(Id))
  267    },
  268    !,
  269    [ #{term: #{identity: Id}} ].
  270es_owner_filter(Query, OnMeta) -->		% private search
  271    { memberchk(user("me"), OnMeta),
  272      NickName = Query.auth.get(display_name)
  273    },
  274    !,
  275    field_filter(NickName, author),
  276    public_filter.
  277es_owner_filter(Query, OnMeta) -->		% private search
  278    { memberchk(user("me"), OnMeta),
  279      user_property(Query.auth, avatar(Avatar))
  280    },
  281    !,
  282    [ #{term: #{avatar: Avatar}} ],
  283    public_filter.
  284es_owner_filter(_Query, OnMeta) -->
  285    { memberchk(user(User), OnMeta),
  286      User \== "me"
  287    },
  288    field_filter(User, author),
  289    public_filter.
  290es_owner_filter(_, _) -->
  291    public_filter.
  292
  293public_filter -->
  294    [ #{term: #{public: true}} ].
  295
  296es_tag_filter(Query) -->
  297    { memberchk(tag(Tag), Query) },
  298    !,
  299    field_filter(Tag, tags).
  300es_tag_filter(_) -->
  301    [].
  302
  303es_name_filter(Query) -->
  304    { memberchk(name(Name), Query) },
  305    !,
  306    field_filter(Name, tags).
  307es_name_filter(_) -->
  308    [].
  309
  310es_type_filter(Query) -->
  311    { memberchk(type(Type), Query) },
  312    !,
  313    [ #{term: #{type: Type}}].
  314es_type_filter(_) -->
  315    [].
  316
  317%!  field_filter(+Spec, +Field)//
  318%
  319%   Add a filter on the given meta-data
  320
  321field_filter(regex(RE,Flags), Field) -->
  322    !,
  323    { convlist(re_pair, Flags, REOpts),
  324      dict_pairs(REDict, #, [Field - REProps]),
  325      dict_pairs(REProps, #, [value-RE|REOpts])
  326    },
  327    [ #{regexp: REDict} ].
  328field_filter(String, Field) -->
  329    { dict_pairs(WCDict, #, [Field - WCProps]),
  330      dict_pairs(WCProps, #, [value-String, case_insensitive-true])
  331    },
  332    [ #{wildcard: WCDict} ].
  333
  334re_pair(i, case_insensitive-true).
  335
  336%!  es_ordering(+OnContent, +OnMeta, +Query, -Field, -Dir) is det.
  337
  338es_ordering(_OnContent, _OnMeta, Query, Field, Dir) :-
  339    _{ order_by:Field, order: Dir} :< Query,
  340    !.
  341es_ordering(_OnContent, _OnMeta, Query, Field, Dir) :-
  342    _{ order_by:Field} :< Query,
  343    !,
  344    (   Field == time
  345    ->  Dir = desc
  346    ;   Dir = asc
  347    ).
  348es_ordering(OnContent, _OnMeta, _Query, '_score', desc) :-
  349    OnContent \== [],
  350    !.
  351es_ordering(_OnContent, _OnMeta, _Query, 'time', desc).
  352
  353%!  es_to_swish(+ESResuls, -Results) is det.
  354%
  355%   Extract our compatible results from the Elastic result set.
  356
  357es_to_swish(ESResult, #{ matches:Matches,
  358                         total:Total,
  359                         cpu:CPU,
  360                         cache:false
  361                       }) :-
  362    Hits = ESResult.hits,
  363    CPU is ESResult.get(took, 0)/1000.0,
  364    Total = Hits.total.value,
  365    maplist(es_to_swish_hit, Hits.hits, Matches).
  366
  367es_to_swish_hit(ESHit, Match) :-
  368    dict_pairs(ESHit.fields, _, Pairs),
  369    convlist(to_single, Pairs, RPairs),
  370    add_highlight(ESHit, RPairs, RPairs1),
  371    dict_pairs(Match, #, RPairs1).
  372
  373to_single(tags-Tags, tags-Tags) :- !.
  374to_single(Field-[Value], Field-Value) :- !.
  375
  376add_highlight(ESHit, Pairs, [highlight-Lines|Pairs]) :-
  377    Lines = ESHit.get(highlight).get(content),
  378    !.
  379add_highlight(_, Pairs, Pairs).
  380
  381
  382		 /*******************************
  383		 *          TYPEAHEAD		*
  384		 *******************************/
  385
  386:- multifile web_storage:typeahead_hooked/1.  387
  388web_storage:typeahead_hooked(file).
  389web_storage:typeahead_hooked(store_content).
  390
  391:- multifile swish_search:typeahead/4.       % +Set, +Query, -Match, +Options
  392
  393%!  swish_search:typeahead(+Set, +Query, -Result, +Options) is nondet.
  394%
  395%   Search for typeahead of the top-right search box.
  396
  397swish_search:typeahead(file, For, FileInfo, _Options) :-
  398    atom_concat(For, *, Pattern),
  399    ESQuery = #{query:
  400                #{bool:
  401                  #{ must:
  402                     [ #{query_string: #{
  403                             query: Pattern,
  404                             fields: [name, title, tags]
  405                         }}
  406                     ],
  407                     filter: [ #{term: #{public: true}} ]
  408                   }
  409                 },
  410                '_source': false,
  411                fields:
  412                [ name,                  % select fields
  413                  time,
  414                  author,
  415                  avatar,
  416                  tags
  417                ],
  418                sort:[#{time: #{order:desc}}],
  419                from: 0,
  420                size: 10
  421               },
  422    es_query(ESQuery, Matches),
  423    es_to_swish(Matches, Result),
  424    member(Hit, Result.matches),
  425    File = Hit.name,
  426    FileInfo = Hit.put(_{type:"store", file:File}).
  427
  428swish_search:typeahead(store_content, Text, FileInfo, Options) :-
  429    http_current_request(Request),
  430    authenticate(Request, Auth),
  431    limit(25, se_typeahead(store_content, Text, FileInfo,
  432                           Options.put(auth,Auth))).
  433
  434se_typeahead(store_content, Text, FileInfo, Options) :-
  435    option(auth(Auth), Options),
  436    phrase(es_owner_filter(#{auth:Auth.put(Options)}, [user("me")]), Filter),
  437    Query = #{query:
  438              #{bool:
  439                #{ must:
  440                   [ #{match: #{content: Text}}
  441                   ],
  442                   filter: Filter
  443                 }
  444               },
  445              '_source': false,        % do not include source
  446              fields:
  447              [ name,                  % select fields
  448                time,
  449                author,
  450                avatar,
  451                tags
  452              ],
  453              track_total_hits: true,  % default
  454              highlight:
  455              #{ fields:
  456                 #{ content:
  457                    #{ number_of_fragments: 1
  458                     }
  459                  },
  460                 encoder: html,
  461                 boundary_chars: '\n'
  462               },
  463              sort:[#{time: #{order:desc}}],
  464              from:0, size:10
  465             },
  466    (   debugging(elastic)
  467    ->  print_term(Query, [nl(true)])
  468    ;   true
  469    ),
  470    es_query(Query, Matches),
  471    es_to_swish(Matches, Result),
  472    member(Hit, Result.matches),
  473    File = Hit.name,
  474    member(Line, Hit.highlight),
  475    FileInfo = Hit.put(_{type:"store", file:File,
  476                         line: 0, text:Line, query:Text,
  477                         encoder: html
  478                        }).
  479
  480
  481		 /*******************************
  482		 *           UPDATES		*
  483		 *******************************/
  484
  485% Listen to new and updated  files  to   update  the  index. This should
  486% normally be fine to keep the index   up-to-date as this is executed by
  487% the swish instance that makes the change.   Only  a badly timed crash,
  488% i.e., between adding to the  store   and  this  broadcast request, can
  489% cause a new commit to be lost.
  490
  491% In addition, es_add_since/1 may be  used   to  simply update all files
  492% added or modified since some time period. This is slow though.
  493
  494:- listen(swish(created(File, _Commit)),
  495          es_add_file(File)).  496:- listen(swish(updated(File, _Commit)),
  497          es_add_file(File)).