Natools

Check-in [24c6ae742e]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:smaz_tools: add a dictionary-independent version of the dynamic hashes
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: 24c6ae742efb13e43fefbc7240871102d409b701
User & Date: nat 2016-11-23 21:10:58
Context
2016-11-24
21:01
smaz_tools: add a Dummy_Hash function for dictionary-specific tools check-in: 4aa72a0c16 user: nat tags: trunk
2016-11-23
21:10
smaz_tools: add a dictionary-independent version of the dynamic hashes check-in: 24c6ae742e user: nat tags: trunk
2016-11-22
20:04
smaz_tools: new package for dictionary-independent tools check-in: b87eafd22c user: nat tags: trunk
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to src/natools-smaz_tools.adb.

9
10
11
12
13
14
15
16


17
18
19







































































































20
21
22
23
24
25
26
-- WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF         --
-- MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR  --
-- ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES   --
-- WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN    --
-- ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF  --
-- OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.           --
------------------------------------------------------------------------------



package body Natools.Smaz_Tools is

   package Sx renames Natools.S_Expressions;









































































































   ----------------------
   -- Public Interface --
   ----------------------

   procedure Read_List








>
>



>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
-- WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF         --
-- MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR  --
-- ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES   --
-- WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN    --
-- ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF  --
-- OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.           --
------------------------------------------------------------------------------

with Ada.Unchecked_Deallocation;

package body Natools.Smaz_Tools is

   package Sx renames Natools.S_Expressions;

   function Build_Node
     (Map : Dictionary_Maps.Map;
      Empty_Value : Natural)
     return Trie_Node;

   procedure Set_Map
     (Map : in out Dictionary_Maps.Map;
      List : in String_Lists.List);
      --  Set Map contents to match List by index number

   procedure Free is new Ada.Unchecked_Deallocation
     (Trie_Node, Trie_Node_Access);


   ------------------------------
   -- Local Helper Subprograms --
   ------------------------------

   function Build_Node
     (Map : Dictionary_Maps.Map;
      Empty_Value : Natural)
     return Trie_Node
   is
      function First_Character (S : String) return Character
        is (S (S'First));
      function Is_Current (Cursor : Dictionary_Maps.Cursor; C : Character)
        return Boolean
        is (Dictionary_Maps.Has_Element (Cursor)
            and then First_Character (Dictionary_Maps.Key (Cursor)) = C);

      function Suffix (S : String) return String;

      function Suffix (S : String) return String is
      begin
         return S (S'First + 1 .. S'Last);
      end Suffix;

      use type Ada.Containers.Count_Type;
      Cursor : Dictionary_Maps.Cursor;
      Result : Trie_Node
        := (Ada.Finalization.Controlled with
            Is_Leaf => False,
            Index => Empty_Value,
            Children => (others => null));
   begin
      pragma Assert (Dictionary_Maps.Length (Map) >= 1);

      Cursor := Dictionary_Maps.Find (Map, "");

      if Dictionary_Maps.Has_Element (Cursor) then
         Result.Index := Dictionary_Maps.Element (Cursor);
      end if;

      for C in Character'Range loop
         Cursor := Dictionary_Maps.Ceiling (Map, (1 => C));

         if Is_Current (Cursor, C) then
            if not Is_Current (Dictionary_Maps.Next (Cursor), C)
              and then Dictionary_Maps.Key (Cursor) = (1 => C)
            then
               Result.Children (C)
                 := new Trie_Node'(Ada.Finalization.Controlled with
                     Is_Leaf => True,
                     Index => Dictionary_Maps.Element (Cursor));
            else
               declare
                  New_Map : Dictionary_Maps.Map;
               begin
                  loop
                     Dictionary_Maps.Insert
                       (New_Map,
                        Suffix (Dictionary_Maps.Key (Cursor)),
                        Dictionary_Maps.Element (Cursor));
                     Dictionary_Maps.Next (Cursor);
                     exit when not Is_Current (Cursor, C);
                  end loop;

                  Result.Children (C)
                    := new Trie_Node'(Build_Node (New_Map, Empty_Value));
               end;
            end if;
         end if;
      end loop;

      return Result;
   end Build_Node;


   procedure Set_Map
     (Map : in out Dictionary_Maps.Map;
      List : in String_Lists.List)
   is
      I : Natural := 0;
   begin
      Dictionary_Maps.Clear (Map);

      for S of List loop
         Dictionary_Maps.Insert (Map, S, I);
         I := I + 1;
      end loop;
   end Set_Map;



   ----------------------
   -- Public Interface --
   ----------------------

   procedure Read_List
49
50
51
52
53
54
55















































































































56
57
58
59
60
61
62
               exit Read_Loop;
         end case;

         Descriptor.Next (Event);
      end loop Read_Loop;
   end Read_List;


















































































































   -------------------
   -- Word Counting --
   -------------------

   procedure Add_Substrings







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
               exit Read_Loop;
         end case;

         Descriptor.Next (Event);
      end loop Read_Loop;
   end Read_List;



   ---------------------------------
   -- Dynamic Dictionary Searches --
   ---------------------------------

   overriding procedure Adjust (Node : in out Trie_Node) is
   begin
      if not Node.Is_Leaf then
         for C in Node.Children'Range loop
            if Node.Children (C) /= null then
               Node.Children (C) := new Trie_Node'(Node.Children (C).all);
            end if;
         end loop;
      end if;
   end Adjust;


   overriding procedure Finalize (Node : in out Trie_Node) is
   begin
      if not Node.Is_Leaf then
         for C in Node.Children'Range loop
            Free (Node.Children (C));
         end loop;
      end if;
   end Finalize;


   procedure Initialize
     (Trie : out Search_Trie;
      List : in String_Lists.List)
   is
      Map : Dictionary_Maps.Map;
      Not_Found : constant Natural := Natural (String_Lists.Length (List));
   begin
      Set_Map (Map, List);

      Trie := (Not_Found => Not_Found,
               Root => Build_Node (Map, Not_Found));
   end Initialize;


   function Linear_Search (Value : String) return Natural is
      Result : Natural := 0;
   begin
      for S of List_For_Linear_Search loop
         exit when S = Value;
         Result := Result + 1;
      end loop;

      return Result;
   end Linear_Search;


   function Map_Search (Value : String) return Natural is
      Cursor : constant Dictionary_Maps.Cursor
        := Dictionary_Maps.Find (Search_Map, Value);
   begin
      if Dictionary_Maps.Has_Element (Cursor) then
         return Natural (Dictionary_Maps.Element (Cursor));
      else
         return Natural (Dictionary_Maps.Length (Search_Map));
      end if;
   end Map_Search;


   function Search (Trie : in Search_Trie; Value : in String) return Natural is
      Index : Positive := Value'First;
      Position : Trie_Node_Access;
   begin
      if Value'Length = 0 then
         return Trie.Not_Found;
      end if;

      Position := Trie.Root.Children (Value (Index));

      loop
         if Position = null then
            return Trie.Not_Found;
         end if;

         Index := Index + 1;

         if Index not in Value'Range then
            return Position.Index;
         elsif Position.Is_Leaf then
            return Trie.Not_Found;
         end if;

         Position := Position.Children (Value (Index));
      end loop;
   end Search;


   procedure Set_Dictionary_For_Map_Search (List : in String_Lists.List) is
   begin
      Set_Map (Search_Map, List);
   end Set_Dictionary_For_Map_Search;


   procedure Set_Dictionary_For_Trie_Search (List : in String_Lists.List) is
   begin
      Initialize (Trie_For_Search, List);
   end Set_Dictionary_For_Trie_Search;


   function Trie_Search (Value : String) return Natural is
   begin
      return Search (Trie_For_Search, Value);
   end Trie_Search;



   -------------------
   -- Word Counting --
   -------------------

   procedure Add_Substrings

Changes to src/natools-smaz_tools.ads.

22
23
24
25
26
27
28

29
30
31
32
33
34
35
36
37
38
39
40






















41
42
43
44
45
46
47
------------------------------------------------------------------------------

with Ada.Containers.Indefinite_Doubly_Linked_Lists;
with Natools.S_Expressions;

private with Ada.Containers.Indefinite_Ordered_Maps;
private with Ada.Containers.Indefinite_Ordered_Sets;


package Natools.Smaz_Tools is
   pragma Preelaborate;

   package String_Lists is new Ada.Containers.Indefinite_Doubly_Linked_Lists
     (String);

   procedure Read_List
     (List : out String_Lists.List;
      Descriptor : in out S_Expressions.Descriptor'Class);
      --  Read atoms from Descriptor to fill List
























   type String_Count is range 0 .. 2 ** 31 - 1;
      --  Type for a number of substring occurrences

   package Methods is
      type Enum is (Encoded, Frequency, Gain);
   end Methods;







>












>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
------------------------------------------------------------------------------

with Ada.Containers.Indefinite_Doubly_Linked_Lists;
with Natools.S_Expressions;

private with Ada.Containers.Indefinite_Ordered_Maps;
private with Ada.Containers.Indefinite_Ordered_Sets;
private with Ada.Finalization;

package Natools.Smaz_Tools is
   pragma Preelaborate;

   package String_Lists is new Ada.Containers.Indefinite_Doubly_Linked_Lists
     (String);

   procedure Read_List
     (List : out String_Lists.List;
      Descriptor : in out S_Expressions.Descriptor'Class);
      --  Read atoms from Descriptor to fill List


   List_For_Linear_Search : String_Lists.List;
   function Linear_Search (Value : String) return Natural;
      --  Function and data source for inefficient but dynamic function
      --  that can be used with Dictionary.Hash.

   procedure Set_Dictionary_For_Map_Search (List : in String_Lists.List);
   function Map_Search (Value : String) return Natural;
      --  Function and data source for logarithmic search using standard
      --  ordered map, that can be used with Dictionary.Hash.

   type Search_Trie is private;
   procedure Initialize (Trie : out Search_Trie; List : in String_Lists.List);
   function Search (Trie : in Search_Trie; Value : in String) return Natural;
      --  Trie-based search in a dynamic dictionary, for lookup whose
      --  speed-vs-memory is even more skewed towards speed.

   procedure Set_Dictionary_For_Trie_Search (List : in String_Lists.List);
   function Trie_Search (Value : String) return Natural;
      --  Function and data source for trie-based search that can be
      --  used with Dictionary.Hash.


   type String_Count is range 0 .. 2 ** 31 - 1;
      --  Type for a number of substring occurrences

   package Methods is
      type Enum is (Encoded, Frequency, Gain);
   end Methods;
149
150
151
152
153
154
155
156





























157
   function To_Scored_Word
     (Cursor : in Word_Maps.Cursor;
      Method : in Methods.Enum)
     return Scored_Word;

   package Scored_Word_Sets is new Ada.Containers.Indefinite_Ordered_Sets
     (Scored_Word);






























end Natools.Smaz_Tools;








>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>

172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
   function To_Scored_Word
     (Cursor : in Word_Maps.Cursor;
      Method : in Methods.Enum)
     return Scored_Word;

   package Scored_Word_Sets is new Ada.Containers.Indefinite_Ordered_Sets
     (Scored_Word);

   package Dictionary_Maps is new Ada.Containers.Indefinite_Ordered_Maps
     (String, Natural);

   Search_Map : Dictionary_Maps.Map;

   type Trie_Node;
   type Trie_Node_Access is access Trie_Node;
   type Trie_Node_Array is array (Character) of Trie_Node_Access;

   type Trie_Node (Is_Leaf : Boolean) is new Ada.Finalization.Controlled
     with record
      Index : Natural;

      case Is_Leaf is
         when True  => null;
         when False => Children : Trie_Node_Array;
      end case;
   end record;

   overriding procedure Adjust (Node : in out Trie_Node);
   overriding procedure Finalize (Node : in out Trie_Node);

   type Search_Trie is record
      Not_Found : Natural;
      Root : Trie_Node (False);
   end record;

   Trie_For_Search : Search_Trie;

end Natools.Smaz_Tools;