View source with raw comments or as raw
    1/*  Part of SWISH
    2
    3    Author:        Jan Wielemaker
    4    E-mail:        J.Wielemaker@cs.vu.nl
    5    WWW:           http://www.swi-prolog.org
    6    Copyright (C): 2017, VU University Amsterdam
    7			 CWI Amsterdam
    8    All rights reserved.
    9
   10    Redistribution and use in source and binary forms, with or without
   11    modification, are permitted provided that the following conditions
   12    are met:
   13
   14    1. Redistributions of source code must retain the above copyright
   15       notice, this list of conditions and the following disclaimer.
   16
   17    2. Redistributions in binary form must reproduce the above copyright
   18       notice, this list of conditions and the following disclaimer in
   19       the documentation and/or other materials provided with the
   20       distribution.
   21
   22    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   23    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   24    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   25    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
   26    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   27    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   28    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   29    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
   30    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   31    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
   32    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   33    POSSIBILITY OF SUCH DAMAGE.
   34*/
   35
   36:- module(swish_data_source,
   37          [ data_source/2,              % :Id, +Source
   38            data_record/2,              % :Id, -Record
   39            record/2,                   % :Id, -Record
   40            data_property/2,            % :Id, ?Property
   41            data_row/2,                 % :Id, -Row
   42            data_row/4,                 % :Id, +Range, +Header, -Row
   43            data_dump/3,                % :Id, +Range, -Row
   44
   45            data_flush/1,               % +Hash
   46            'data assert'/1,            % +Term
   47            'data materialized'/3,	% +Hash, +Signature, +SourceID
   48            'data failed'/2		% +Hash, +Signature
   49          ]).   50:- use_module(library(error)).   51:- use_module(library(lists)).   52:- use_module(library(settings)).   53:- use_module(library(solution_sequences)).   54:- use_module(library(pengines)).   55
   56:- setting(max_memory, integer, 8000,
   57           "Max memory used for cached data store (Mb)").

/** <module> Cached data access

This module provides access to external data by caching it as a Prolog
predicate. The data itself is kept in a global data module, so it is
maintained over a SWISH Pengine invocation.
*/

   67:- meta_predicate
   68    data_source(:, +),
   69    data_record(:, -),
   70    record(:, -),
   71    data_row(:, -),
   72    data_row(:, +, +, -),
   73    data_dump(:, +, -),
   74    data_property(:, -).   75
   76:- multifile
   77    source/2.                           % +Term, -Goal
   78
   79
   80		 /*******************************
   81		 *          ADMIN DATA		*
   82		 *******************************/
   83
   84:- dynamic
   85    data_source_db/3,                   % Hash, Goal, Lock
   86    data_signature_db/2,                % Hash, Signature
   87    data_materialized/5,                % Hash, Materialized, SourceID, CPU, Wall
   88    data_last_access/3.                 % Hash, Time, Updates
   89
   90'data assert'(Term) :-
   91    assertz(Term).
%!  'data materialized'(+Hash, +Signature, +SourceVersion) is semidet.
%
%   Called by a data plugin to indicate that loading the data has
%   finished. Records Signature for Hash, together with the current
%   time, SourceVersion and the CPU and wall time elapsed since the
%   start values held in the global variable `$data_source_materalize`.
%   Fails if that global variable does not hold a term stats(Time, CPU).
%
%   @arg Hash is the hash of the original data source
%   @arg Signature is a term Hash(Arg1, Arg2, ...), where Arg1, ...
%        are atoms or small integers that indicate the field names.
%   @arg SourceVersion is a term that indicates the identity of the
%        source. This is typically a dict containing e.g., a time
%        stamp, content hash, HTTP Etag value, etc.

'data materialized'(Hash, Signature, SourceVersion) :-
    statistics(cputime, CPUNow),
    get_time(WallNow),
    nb_current('$data_source_materalize', stats(WallStart, CPUStart)),
    CPUUsed  is CPUNow - CPUStart,
    WallUsed is WallNow - WallStart,
    assertz(data_signature_db(Hash, Signature)),
    assertz(data_materialized(Hash, WallNow, SourceVersion,
                              CPUUsed, WallUsed)).
  113
  114'data failed'(_Hash, Signature) :-
  115    functor(Signature, Name, Arity),
  116    functor(Generic, Name, Arity),
  117    retractall(Generic).
%!  data_source(:Id, +Source) is det.
%
%   Create a data source Id from the source definition Source. Source
%   definitions are plugin files loaded from swish(data).
%
%   If Source is already registered, Id is linked to the existing
%   entry unless that link is already present. Otherwise Source is
%   validated, registered together with a new mutex, and linked to Id.
%
%   @error existence_error(data_source, Source) if Source is not
%          accepted by source/2.

data_source(M:Id, Source) :-
    variant_sha1(Source, Hash),
    (   data_source_db(Hash, Source, _)
    ->  (   clause(M:'$data'(Id, Hash), true)
        ->  true
        ;   assertz(M:'$data'(Id, Hash))
        )
    ;   valid_source(Source),
        mutex_create(Lock),
        assertz(data_source_db(Hash, Source, Lock)),
        assertz(M:'$data'(Id, Hash))
    ).
%!  record(:Id, -Record) is nondet.
%!  data_record(:Id, -Record) is nondet.
%
%   True when Record is a dict representing a row in the dataset
%   identified by Id. The dataset is materialized before its rows
%   are enumerated.
%
%   @deprecated record/2 is deprecated. New code should use
%               data_record/2.

record(Id, Record) :-
    data_record(Id, Record).

data_record(M:Id, Record) :-
    data_hash(M:Id, Hash),
    materialize(Hash),
    data_signature_db(Hash, Signature),
    data_record(Signature, Id, Record, RowGoal),
    call(RowGoal).
  157
  158data_record(Signature, Tag, Record, Head) :-
  159    Signature =.. [Name|Keys],
  160    pairs_keys_values(Pairs, Keys, Values),
  161    dict_pairs(Record, Tag, Pairs),
  162    Head =.. [Name|Values].
  163
  164data_hash(M:Id, Hash) :-
  165    clause(M:'$data'(Id, Hash), true),
  166    !.
  167data_hash(_:Id, _) :-
  168    existence_error(dataset, Id).
%!  data_row(:Id, -Row) is nondet.
%!  data_row(:Id, +Range, +Header, -Row) is nondet.
%
%   True when Row is a term Id(Arg, ...), where the first row contains
%   the column names. data_row/2 enumerates all rows, preceded by the
%   header row.
%
%   @arg Header If `true`, include a header row.
%   @see data_dump/3 to return a table and for a description of Range.

data_row(Id, Row) :-
    data_row(Id, all, true, Row).

data_row(M:Id, Range, Header, Row) :-
    must_be(boolean, Header),
    data_hash(M:Id, Hash),
    materialize(Hash),
    data_signature_db(Hash, Signature),
    Signature =.. [_|ColNames],
    same_length(ColNames, Cells),
    Fetch =.. [Hash|Cells],
    Row   =.. [Id|Cells],
    (   Header == true,
        Cells = ColNames                % first solution: the header row
    ;   range(Range, M:Id, Fetch)       % then the selected data rows
    ).
  195
  196range(all, _Id, Goal) :-
  197    !,
  198    call(Goal).
  199range(From-To, _Id, Goal) :-
  200    !,
  201    Skip is From - 1,
  202    Size is To-Skip,
  203    limit(Size, offset(Skip, call(Goal))).
  204range(Limit, _Id, Goal) :-
  205    Limit >= 0,
  206    !,
  207    limit(Limit, call(Goal)).
  208range(Limit, Id, Goal) :-
  209    Limit < 0,
  210    data_property(Id, rows(Rows)),
  211    Skip is Rows+Limit,
  212    offset(Skip, call(Goal)).
%!  data_dump(:Id, +Range, -Table) is det.
%
%   Table is a list of rows in the indicated range, starting with the
%   header row. This cooperates with the table rendering to produce a
%   data table. Range is one of:
%
%     - all
%       All rows from the data are included. Be careful if these are
%       many as it is likely to make your browser very slow.
%     - From - To
%       List the (1-based) rows From to To
%     - Count
%       If Count >= 0, list the first, else list the last Count rows.

data_dump(Id, Range, Table) :-
    findall(Row,
            data_row(Id, Range, true, Row),
            Table).
%!  data_property(:Id, ?Property) is nondet.
%
%   True when Property is a known property about the data source Id.
%   The data source is materialized first. Defined properties are:
%
%     - columns(-Count)
%       Number of columns in the table.
%     - column_names(-Names)
%       Names is a list of the column names as they appear in the data.
%     - rows(-Rows)
%       Number of rows in the table
%     - hash(-Hash)
%       Get the internal (hashed) identifier for the data source
%     - source_version(-SourceVersion)
%       A term (often a dict) that provides version information about
%       the source. Details depend on the source.
%     - materialized(-TimeStamp)
%       The data source was materialized at TimeStamp.
%     - source(-Term)
%       Description of the original source term used to declare the
%       data source

data_property(M:Id, Prop) :-
    data_hash(M:Id, DataHash),
    materialize(DataHash),
    property(Prop),
    property(Prop, DataHash).

% property(?Property): enumerates the supported property terms.
property(columns(_)).
property(column_names(_)).
property(rows(_)).
property(hash(_)).
property(source_version(_)).
property(materialized(_)).
property(source(_)).
  268
  269property(columns(Count), Hash) :-
  270    data_signature_db(Hash, Signature),
  271    functor(Signature, _, Count).
  272property(column_names(Names), Hash) :-
  273    data_signature_db(Hash, Signature),
  274    Signature =.. [_|Names].
  275property(rows(Count), Hash) :-
  276    data_signature_db(Hash, Signature),
  277    predicate_property(Signature, number_of_clauses(Count)).
  278property(hash(Hash), Hash).
  279property(source_version(SourceVersion), Hash) :-
  280    data_materialized(Hash, _, SourceVersion, _, _).
  281property(materialized(TimeStamp), Hash) :-
  282    data_materialized(Hash, TimeStamp, _, _, _).
  283property(source(SourceTerm), Hash) :-
  284    data_source_db(Hash, SourceTerm, _Lock).
%!  swish:goal_expansion(+Dict, -DataGoal)
%
%   Translate a Dict where the tag is the identifier of a data source
%   and the keys are columns of this source into a goal on the data.
%   Note that the data itself is represented as a Prolog predicate,
%   representing each row as a fact and each column as an argument.

:- multifile
    swish:goal_expansion/2.

swish:goal_expansion(Dict, swish_data_source:RowGoal) :-
    is_dict(Dict, Tag),
    prolog_load_context(module, Module),
    clause(Module:'$data'(Tag, Hash), true),
    materialize(Hash),
    data_signature_db(Hash, Signature),
    data_record(Signature, Tag, FullRecord, RowGoal),
    Dict :< FullRecord.                 % bind the selected columns
  304
  305
  306		 /*******************************
  307		 *       DATA MANAGEMENT	*
  308		 *******************************/
  309
  310valid_source(Source) :-
  311    must_be(nonvar, Source),
  312    source(Source, _Goal),
  313    !.
  314valid_source(Source) :-
  315    existence_error(data_source, Source).
%!  materialize(+Hash)
%
%   Materialise the data identified by Hash. If the data is already
%   materialized only the last access time is updated. Otherwise the
%   access time is updated, gc_data/0 is run and the data is loaded
%   while holding the mutex registered for the source.
%
%   The materialization goal is obtained from source/2 and is called
%   with Hash as additional argument. It should call
%   'data materialized'/3 once loading the data has finished.

materialize(Hash) :-
    must_be(atom, Hash),
    (   data_materialized(Hash, _When, _From, _CPU, _Wall)
    ->  update_last_access(Hash)
    ;   data_source_db(Hash, Source, Lock),
        update_last_access(Hash),
        gc_data,
        with_mutex(Lock, materialize_sync(Hash, Source))
    ).
  341
  342materialize_sync(Hash, _Source) :-
  343    data_materialized(Hash, _When, _From, _CPU, _Wall),
  344    !.
  345materialize_sync(Hash, Source) :-
  346    source(Source, Goal),
  347    get_time(Time0),
  348    statistics(cputime, CPU0),
  349    setup_call_cleanup(
  350        b_setval('$data_source_materalize', stats(Time0, CPU0)),
  351        call(Goal, Hash),
  352        nb_delete('$data_source_materalize')),
  353    data_signature_db(Hash, Head),
  354    functor(Head, Name, Arity),
  355    public(Name/Arity).
  356
  357
  358		 /*******************************
  359		 *              GC		*
  360		 *******************************/
%!  update_last_access(+Hash) is det.
%
%   Update the last known access time. The value is rounded down to 1
%   minute to reduce database updates. The update counter is
%   incremented each time the stored minute changes.

update_last_access(Hash) :-
    get_time(Now),
    Minute is floor(Now/60)*60,
    (   data_last_access(Hash, Minute, _)
    ->  true                            % already recorded for this minute
    ;   clause(data_last_access(Hash, _, Count0), true, OldRef)
    ->  Count is Count0+1,
        asserta(data_last_access(Hash, Minute, Count)),
        erase(OldRef)
    ;   asserta(data_last_access(Hash, Minute, 1))
    ).
  378
  379gc_stats(Hash, _{ hash:Hash,
  380                  materialized:When, cpu:CPU, wall:Wall,
  381                  bytes:Size,
  382                  last_accessed_ago:Ago,
  383                  access_frequency:AccessCount
  384                }) :-
  385    data_materialized(Hash, When, _From, CPU, Wall),
  386    data_signature_db(Hash, Signature),
  387    data_last_access(Hash, Last, AccessCount),
  388    get_time(Now),
  389    Ago is floor(Now/60)*60-Last,
  390    predicate_property(Signature, number_of_clauses(Count)),
  391    functor(Signature, _, Arity),
  392    Size is (88+(16*Arity))*Count.
%!  gc_data is det.
%!  gc_data(+MaxSize) is det.
%
%   Remove the least recently used data sets until the program size
%   of this module drops below MaxSize (bytes). gc_data/0 takes the
%   limit from the setting `max_memory` (Mb) and also installs it as
%   the program space limit of the module. The predicate gc_data/0 is
%   called before materializing a data source.

gc_data :-
    setting(max_memory, MB),
    Bytes is MB*1024*1024,
    gc_data(Bytes),
    set_module(program_space(Bytes)).

gc_data(MaxSize) :-
    module_property(swish_data_source, program_size(Size)),
    Size < MaxSize,
    !.
gc_data(MaxSize) :-
    findall(Stat, gc_stats(_, Stat), Stats),
    % Longest unused first.
    sort(last_accessed_ago, >=, Stats, ByTime),
    % Failure driven loop: flush one data set at a time and stop as
    % soon as we are below the limit. The hash must be taken from the
    % individual Stat dict; ByTime is a list and not a dict.
    member(Stat, ByTime),
       data_flush(Stat.hash),
       module_property(swish_data_source, program_size(Size)),
       Size < MaxSize,
    !.
gc_data(_).
%!  data_flush(+Hash)
%
%   Drop the data associated with hash: the clauses of the data
%   predicate, the signature, the materialization record and the last
%   access record. Fails if no signature is stored for Hash.
%
%   The most general data goal is built using functor/3 rather than
%   by constructing a dict, so flushing also works if the column
%   names are not valid as a set of dict keys (e.g., duplicates).

data_flush(Hash) :-
    data_signature_db(Hash, Signature),
    functor(Signature, Name, Arity),
    functor(Generic, Name, Arity),
    retractall(Generic),
    retractall(data_signature_db(Hash, Generic)),
    retractall(data_materialized(Hash, _When1, _From, _CPU, _Wall)),
    retractall(data_last_access(Hash, _When2, _Count)).
  434
  435
  436		 /*******************************
  437		 *            SANDBOX		*
  438		 *******************************/
  439
  440:- multifile
  441    sandbox:safe_meta/2.  442
  443sandbox:safe_meta(swish_data_source:data_source(Id,_), [])     :- safe_id(Id).
  444sandbox:safe_meta(swish_data_source:data_record(Id,_), [])     :- safe_id(Id).
  445sandbox:safe_meta(swish_data_source:record(Id,_), [])          :- safe_id(Id).
  446sandbox:safe_meta(swish_data_source:data_row(Id,_), [])        :- safe_id(Id).
  447sandbox:safe_meta(swish_data_source:data_row(Id,_,_,_), [])    :- safe_id(Id).
  448sandbox:safe_meta(swish_data_source:data_dump(Id,_,_), [])     :- safe_id(Id).
  449sandbox:safe_meta(swish_data_source:data_property(Id,_), [])   :- safe_id(Id).
  450
  451safe_id(M:_) :- !, pengine_self(M).
  452safe_id(_)