Natools

Check-in [b87eafd22c]
Login
Overview
Comment:smaz_tools: new package for dictionary-independent tools
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: b87eafd22cc04b43af9c75a5c9f2c3e907a17a5b
User & Date: nat on 2016-11-22 20:04:55
Other Links: manifest | tags
Context
2016-11-23
21:10
smaz_tools: add a dictionary-independent version of the dynamic hashes check-in: 24c6ae742e user: nat tags: trunk
2016-11-22
20:04
smaz_tools: new package for dictionary-independent tools check-in: b87eafd22c user: nat tags: trunk
2016-11-21
20:40
tools/smaz: fix direct dictionaries ignoring variable-length config check-in: b141a142f0 user: nat tags: trunk
Changes

Added src/natools-smaz_tools.adb version [3e624ef1b3].

































































































































































































































































































































































































































































































>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
------------------------------------------------------------------------------
-- Copyright (c) 2016, Natacha Porté                                        --
--                                                                          --
-- Permission to use, copy, modify, and distribute this software for any    --
-- purpose with or without fee is hereby granted, provided that the above   --
-- copyright notice and this permission notice appear in all copies.        --
--                                                                          --
-- THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES --
-- WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF         --
-- MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR  --
-- ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES   --
-- WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN    --
-- ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF  --
-- OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.           --
------------------------------------------------------------------------------

package body Natools.Smaz_Tools is

   package Sx renames Natools.S_Expressions;


   ----------------------
   -- Public Interface --
   ----------------------

   procedure Read_List
     (List : out String_Lists.List;
      Descriptor : in out S_Expressions.Descriptor'Class)
   is
      use type Sx.Events.Event;
      Event : Sx.Events.Event := Descriptor.Current_Event;
   begin
      String_Lists.Clear (List);

      if Event = Sx.Events.Open_List then
         Descriptor.Next (Event);
      end if;

      Read_Loop :
      loop
         case Event is
            when Sx.Events.Add_Atom =>
               String_Lists.Append
                 (List, Sx.To_String (Descriptor.Current_Atom));
            when Sx.Events.Open_List =>
               Descriptor.Close_Current_List;
            when Sx.Events.End_Of_Input | Sx.Events.Error
              | Sx.Events.Close_List =>
               exit Read_Loop;
         end case;

         Descriptor.Next (Event);
      end loop Read_Loop;
   end Read_List;



   -------------------
   -- Word Counting --
   -------------------

   procedure Add_Substrings
     (Counter : in out Word_Counter;
      Phrase : in String;
      Min_Size : in Positive;
      Max_Size : in Positive) is
   begin
      for First in Phrase'First .. Phrase'Last - Min_Size + 1 loop
         for Last in First + Min_Size - 1
           .. Natural'Min (First + Max_Size - 1, Phrase'Last)
         loop
            Add_Word (Counter, Phrase (First .. Last));
         end loop;
      end loop;
   end Add_Substrings;


   procedure Add_Word
     (Counter : in out Word_Counter;
      Word : in String;
      Count : in String_Count := 1)
   is
      procedure Update
        (Key : in String; Element : in out String_Count);

      procedure Update
        (Key : in String; Element : in out String_Count)
      is
         pragma Unreferenced (Key);
      begin
         Element := Element + Count;
      end Update;

      Cursor : constant Word_Maps.Cursor := Word_Maps.Find (Counter.Map, Word);
   begin
      if Word_Maps.Has_Element (Cursor) then
         Word_Maps.Update_Element (Counter.Map, Cursor, Update'Access);
      else
         Word_Maps.Insert (Counter.Map, Word, Count);
      end if;
   end Add_Word;


   procedure Add_Words
     (Counter : in out Word_Counter;
      Phrase : in String;
      Min_Size : in Positive;
      Max_Size : in Positive)
   is
      subtype Word_Part is Character with Static_Predicate
        => Word_Part in '0' .. '9' | 'A' .. 'Z' | 'a' .. 'z'
                      | Character'Val (128) .. Character'Val (255);
      I, First, Next : Positive;
   begin
      if Max_Size < Min_Size then
         return;
      end if;

      I := Phrase'First;

      Main_Loop :
      while I in Phrase'Range loop
         Skip_Non_Word :
         while I in Phrase'Range and then Phrase (I) not in Word_Part loop
            I := I + 1;
         end loop Skip_Non_Word;

         exit Main_Loop when I not in Phrase'Range;
         First := I;

         Skip_Word :
         while I in Phrase'Range and then Phrase (I) in Word_Part loop
            I := I + 1;
         end loop Skip_Word;

         Next := I;

         if Next - First in Min_Size .. Max_Size then
            Add_Word (Counter, Phrase (First .. Next - 1));
         end if;
      end loop Main_Loop;
   end Add_Words;


   procedure Filter_By_Count
     (Counter : in out Word_Counter;
      Threshold_Count : in String_Count)
   is
      Position, Next : Word_Maps.Cursor;
   begin
      Position := Word_Maps.First (Counter.Map);

      while Word_Maps.Has_Element (Position) loop
         Next := Word_Maps.Next (Position);

         if Word_Maps.Element (Position) < Threshold_Count then
            Word_Maps.Delete (Counter.Map, Position);
         end if;

         Position := Next;
      end loop;

      pragma Assert (for all Count of Counter.Map => Count >= Threshold_Count);
   end Filter_By_Count;


   function Simple_Dictionary
     (Counter : in Word_Counter;
      Word_Count : in Natural;
      Method : in Methods.Enum := Methods.Encoded)
     return String_Lists.List
   is
      use type Ada.Containers.Count_Type;
      Target_Count : constant Ada.Containers.Count_Type
        := Ada.Containers.Count_Type (Word_Count);
      Set : Scored_Word_Sets.Set;
      Result : String_Lists.List;
   begin
      for Cursor in Word_Maps.Iterate (Counter.Map) loop
         Scored_Word_Sets.Include (Set, To_Scored_Word (Cursor, Method));

         if Scored_Word_Sets.Length (Set) > Target_Count then
            Scored_Word_Sets.Delete_Last (Set);
         end if;
      end loop;

      for Cursor in Scored_Word_Sets.Iterate (Set) loop
         Result.Append (Scored_Word_Sets.Element (Cursor).Word);
      end loop;

      return Result;
   end Simple_Dictionary;


   procedure Simple_Dictionary_And_Pending
     (Counter : in Word_Counter;
      Word_Count : in Natural;
      Selected : out String_Lists.List;
      Pending : out String_Lists.List;
      Method : in Methods.Enum := Methods.Encoded;
      Max_Pending_Count : in Ada.Containers.Count_Type
        := Ada.Containers.Count_Type'Last)
   is
      use type Ada.Containers.Count_Type;
      Target_Count : constant Ada.Containers.Count_Type
        := Ada.Containers.Count_Type (Word_Count);
      Set : Scored_Word_Sets.Set;
   begin
      for Cursor in Word_Maps.Iterate (Counter.Map) loop
         Scored_Word_Sets.Insert (Set, To_Scored_Word (Cursor, Method));
      end loop;

      Selected := String_Lists.Empty_List;
      Pending := String_Lists.Empty_List;

      for Cursor in Scored_Word_Sets.Iterate (Set) loop
         if String_Lists.Length (Selected) < Target_Count then
            Selected.Append (Scored_Word_Sets.Element (Cursor).Word);
         else
            Pending.Append (Scored_Word_Sets.Element (Cursor).Word);
            exit when String_Lists.Length (Selected) >= Max_Pending_Count;
         end if;
      end loop;
   end Simple_Dictionary_And_Pending;


   function To_Scored_Word
     (Cursor : in Word_Maps.Cursor;
      Method : in Methods.Enum)
     return Scored_Word
   is
      Word : constant String := Word_Maps.Key (Cursor);
   begin
      return Scored_Word'
        (Size => Word'Length,
         Word => Word,
         Score => Score (Word_Maps.Element (Cursor), Word'Length, Method));
   end To_Scored_Word;

end Natools.Smaz_Tools;

Added src/natools-smaz_tools.ads version [ca9e37f8da].



























































































































































































































































































































>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
------------------------------------------------------------------------------
-- Copyright (c) 2016, Natacha Porté                                        --
--                                                                          --
-- Permission to use, copy, modify, and distribute this software for any    --
-- purpose with or without fee is hereby granted, provided that the above   --
-- copyright notice and this permission notice appear in all copies.        --
--                                                                          --
-- THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES --
-- WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF         --
-- MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR  --
-- ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES   --
-- WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN    --
-- ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF  --
-- OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.           --
------------------------------------------------------------------------------

------------------------------------------------------------------------------
-- Natools.Smaz_Tools provides dictionary-independant tools to deal with    --
-- word lists and prepare dictionary creation.                              --
-- Note that the dictionary is intended to be generated and hard-coded,     --
-- so the final client shouldn't need this package.                         --
------------------------------------------------------------------------------

with Ada.Containers.Indefinite_Doubly_Linked_Lists;
with Natools.S_Expressions;

private with Ada.Containers.Indefinite_Ordered_Maps;
private with Ada.Containers.Indefinite_Ordered_Sets;

package Natools.Smaz_Tools is
   pragma Preelaborate;

   package String_Lists is new Ada.Containers.Indefinite_Doubly_Linked_Lists
     (String);

   procedure Read_List
     (List : out String_Lists.List;
      Descriptor : in out S_Expressions.Descriptor'Class);
      --  Read atoms from Descriptor to fill List


   type String_Count is range 0 .. 2 ** 31 - 1;
      --  Type for a number of substring occurrences

   package Methods is
      type Enum is (Encoded, Frequency, Gain);
   end Methods;
      --  Evaluation methods to select words to remove or include

   type Word_Counter is private;
      --  Accumulate frequency/occurrence counts for a set of strings

   procedure Add_Word
     (Counter : in out Word_Counter;
      Word : in String;
      Count : in String_Count := 1);
      --  Include Count number of occurrences of Word in Counter

   procedure Add_Substrings
     (Counter : in out Word_Counter;
      Phrase : in String;
      Min_Size : in Positive;
      Max_Size : in Positive);
      --  Include all the substrings of Phrase whose lengths are
      --  between Min_Size and Max_Size.

   procedure Add_Words
     (Counter : in out Word_Counter;
      Phrase : in String;
      Min_Size : in Positive;
      Max_Size : in Positive);
      --  Add the "words" from Phrase into Counter, with a word being currently
      --  defined as anything between ASCII blanks or punctuation,
      --  or in other words [0-9A-Za-z\x80-\xFF]+

   procedure Filter_By_Count
     (Counter : in out Word_Counter;
      Threshold_Count : in String_Count);
      --  Remove from Counter all entries whose count is below the threshold

   function Simple_Dictionary
     (Counter : in Word_Counter;
      Word_Count : in Natural;
      Method : in Methods.Enum := Methods.Encoded)
     return String_Lists.List;
      --  Return the Word_Count words in Counter that have the highest score,
      --  the score being count * length.

   procedure Simple_Dictionary_And_Pending
     (Counter : in Word_Counter;
      Word_Count : in Natural;
      Selected : out String_Lists.List;
      Pending : out String_Lists.List;
      Method : in Methods.Enum := Methods.Encoded;
      Max_Pending_Count : in Ada.Containers.Count_Type
        := Ada.Containers.Count_Type'Last);
      --  Return in Selected the Word_Count words in Counter that have the
      --  highest score, and in Pending the remaining words,
      --  the score being count * length.


   type Score_Value is range 0 .. 2 ** 31 - 1;

   function Score_Encoded
     (Count : in String_Count; Length : in Positive) return Score_Value
     is (Score_Value (Count) * Score_Value (Length));
      --  Score value using the amount of encoded data by the element

   function Score_Frequency
     (Count : in String_Count; Length : in Positive) return Score_Value
     is (Score_Value (Count));
      --  Score value using the number of times the element was used

   function Score_Gain
     (Count : in String_Count; Length : in Positive) return Score_Value
     is (Score_Value (Count) * (Score_Value (Length) - 1));
      --  Score value using the number of bytes saved using the element

   function Score
     (Count : in String_Count;
      Length : in Positive;
      Method : in Methods.Enum)
     return Score_Value
     is (case Method is
         when Methods.Encoded => Score_Encoded (Count, Length),
         when Methods.Frequency => Score_Frequency (Count, Length),
         when Methods.Gain => Score_Gain (Count, Length));
      --  Scare value with dynamically chosen method

private

   package Word_Maps is new Ada.Containers.Indefinite_Ordered_Maps
     (String, String_Count);

   type Word_Counter is record
      Map : Word_Maps.Map;
   end record;


   type Scored_Word (Size : Natural) is record
      Word : String (1 .. Size);
      Score : Score_Value;
   end record;

   function "<" (Left, Right : Scored_Word) return Boolean
     is (Left.Score > Right.Score
         or else (Left.Score = Right.Score and then Left.Word < Right.Word));

   function To_Scored_Word
     (Cursor : in Word_Maps.Cursor;
      Method : in Methods.Enum)
     return Scored_Word;

   package Scored_Word_Sets is new Ada.Containers.Indefinite_Ordered_Sets
     (Scored_Word);

end Natools.Smaz_Tools;