Many hyperlinks are disabled.
Use anonymous login
to enable hyperlinks.
Overview
| Comment: | smaz_tools: new package for dictionary-independent tools |
|---|---|
| Timelines: | family | ancestors | descendants | both | trunk |
| Files: | files | file ages | folders |
| SHA1: |
b87eafd22cc04b43af9c75a5c9f2c3e9 |
| User & Date: | nat 2016-11-22 20:04:55.089 |
Context
|
2016-11-23
| ||
| 21:10 | smaz_tools: add a dictionary-independent version of the dynamic hashes check-in: 24c6ae742e user: nat tags: trunk | |
|
2016-11-22
| ||
| 20:04 | smaz_tools: new package for dictionary-independent tools check-in: b87eafd22c user: nat tags: trunk | |
|
2016-11-21
| ||
| 20:40 | tools/smaz: fix direct dictionaries ignoring variable-length config check-in: b141a142f0 user: nat tags: trunk | |
Changes
Added src/natools-smaz_tools.adb.
> > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 |
------------------------------------------------------------------------------
-- Copyright (c) 2016, Natacha Porté --
-- --
-- Permission to use, copy, modify, and distribute this software for any --
-- purpose with or without fee is hereby granted, provided that the above --
-- copyright notice and this permission notice appear in all copies. --
-- --
-- THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES --
-- WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF --
-- MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR --
-- ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES --
-- WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN --
-- ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF --
-- OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. --
------------------------------------------------------------------------------
package body Natools.Smaz_Tools is
package Sx renames Natools.S_Expressions;
----------------------
-- Public Interface --
----------------------
procedure Read_List
(List : out String_Lists.List;
Descriptor : in out S_Expressions.Descriptor'Class)
is
use type Sx.Events.Event;
Event : Sx.Events.Event := Descriptor.Current_Event;
begin
String_Lists.Clear (List);
if Event = Sx.Events.Open_List then
Descriptor.Next (Event);
end if;
Read_Loop :
loop
case Event is
when Sx.Events.Add_Atom =>
String_Lists.Append
(List, Sx.To_String (Descriptor.Current_Atom));
when Sx.Events.Open_List =>
Descriptor.Close_Current_List;
when Sx.Events.End_Of_Input | Sx.Events.Error
| Sx.Events.Close_List =>
exit Read_Loop;
end case;
Descriptor.Next (Event);
end loop Read_Loop;
end Read_List;
-------------------
-- Word Counting --
-------------------
procedure Add_Substrings
(Counter : in out Word_Counter;
Phrase : in String;
Min_Size : in Positive;
Max_Size : in Positive) is
begin
for First in Phrase'First .. Phrase'Last - Min_Size + 1 loop
for Last in First + Min_Size - 1
.. Natural'Min (First + Max_Size - 1, Phrase'Last)
loop
Add_Word (Counter, Phrase (First .. Last));
end loop;
end loop;
end Add_Substrings;
procedure Add_Word
(Counter : in out Word_Counter;
Word : in String;
Count : in String_Count := 1)
is
procedure Update
(Key : in String; Element : in out String_Count);
procedure Update
(Key : in String; Element : in out String_Count)
is
pragma Unreferenced (Key);
begin
Element := Element + Count;
end Update;
Cursor : constant Word_Maps.Cursor := Word_Maps.Find (Counter.Map, Word);
begin
if Word_Maps.Has_Element (Cursor) then
Word_Maps.Update_Element (Counter.Map, Cursor, Update'Access);
else
Word_Maps.Insert (Counter.Map, Word, Count);
end if;
end Add_Word;
procedure Add_Words
(Counter : in out Word_Counter;
Phrase : in String;
Min_Size : in Positive;
Max_Size : in Positive)
is
subtype Word_Part is Character with Static_Predicate
=> Word_Part in '0' .. '9' | 'A' .. 'Z' | 'a' .. 'z'
| Character'Val (128) .. Character'Val (255);
I, First, Next : Positive;
begin
if Max_Size < Min_Size then
return;
end if;
I := Phrase'First;
Main_Loop :
while I in Phrase'Range loop
Skip_Non_Word :
while I in Phrase'Range and then Phrase (I) not in Word_Part loop
I := I + 1;
end loop Skip_Non_Word;
exit Main_Loop when I not in Phrase'Range;
First := I;
Skip_Word :
while I in Phrase'Range and then Phrase (I) in Word_Part loop
I := I + 1;
end loop Skip_Word;
Next := I;
if Next - First in Min_Size .. Max_Size then
Add_Word (Counter, Phrase (First .. Next - 1));
end if;
end loop Main_Loop;
end Add_Words;
procedure Filter_By_Count
(Counter : in out Word_Counter;
Threshold_Count : in String_Count)
is
Position, Next : Word_Maps.Cursor;
begin
Position := Word_Maps.First (Counter.Map);
while Word_Maps.Has_Element (Position) loop
Next := Word_Maps.Next (Position);
if Word_Maps.Element (Position) < Threshold_Count then
Word_Maps.Delete (Counter.Map, Position);
end if;
Position := Next;
end loop;
pragma Assert (for all Count of Counter.Map => Count >= Threshold_Count);
end Filter_By_Count;
function Simple_Dictionary
(Counter : in Word_Counter;
Word_Count : in Natural;
Method : in Methods.Enum := Methods.Encoded)
return String_Lists.List
is
use type Ada.Containers.Count_Type;
Target_Count : constant Ada.Containers.Count_Type
:= Ada.Containers.Count_Type (Word_Count);
Set : Scored_Word_Sets.Set;
Result : String_Lists.List;
begin
for Cursor in Word_Maps.Iterate (Counter.Map) loop
Scored_Word_Sets.Include (Set, To_Scored_Word (Cursor, Method));
if Scored_Word_Sets.Length (Set) > Target_Count then
Scored_Word_Sets.Delete_Last (Set);
end if;
end loop;
for Cursor in Scored_Word_Sets.Iterate (Set) loop
Result.Append (Scored_Word_Sets.Element (Cursor).Word);
end loop;
return Result;
end Simple_Dictionary;
procedure Simple_Dictionary_And_Pending
(Counter : in Word_Counter;
Word_Count : in Natural;
Selected : out String_Lists.List;
Pending : out String_Lists.List;
Method : in Methods.Enum := Methods.Encoded;
Max_Pending_Count : in Ada.Containers.Count_Type
:= Ada.Containers.Count_Type'Last)
is
use type Ada.Containers.Count_Type;
Target_Count : constant Ada.Containers.Count_Type
:= Ada.Containers.Count_Type (Word_Count);
Set : Scored_Word_Sets.Set;
begin
for Cursor in Word_Maps.Iterate (Counter.Map) loop
Scored_Word_Sets.Insert (Set, To_Scored_Word (Cursor, Method));
end loop;
Selected := String_Lists.Empty_List;
Pending := String_Lists.Empty_List;
for Cursor in Scored_Word_Sets.Iterate (Set) loop
if String_Lists.Length (Selected) < Target_Count then
Selected.Append (Scored_Word_Sets.Element (Cursor).Word);
else
Pending.Append (Scored_Word_Sets.Element (Cursor).Word);
exit when String_Lists.Length (Selected) >= Max_Pending_Count;
end if;
end loop;
end Simple_Dictionary_And_Pending;
function To_Scored_Word
(Cursor : in Word_Maps.Cursor;
Method : in Methods.Enum)
return Scored_Word
is
Word : constant String := Word_Maps.Key (Cursor);
begin
return Scored_Word'
(Size => Word'Length,
Word => Word,
Score => Score (Word_Maps.Element (Cursor), Word'Length, Method));
end To_Scored_Word;
end Natools.Smaz_Tools;
|
Added src/natools-smaz_tools.ads.
> > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 |
------------------------------------------------------------------------------
-- Copyright (c) 2016, Natacha Porté --
-- --
-- Permission to use, copy, modify, and distribute this software for any --
-- purpose with or without fee is hereby granted, provided that the above --
-- copyright notice and this permission notice appear in all copies. --
-- --
-- THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES --
-- WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF --
-- MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR --
-- ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES --
-- WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN --
-- ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF --
-- OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. --
------------------------------------------------------------------------------
------------------------------------------------------------------------------
-- Natools.Smaz_Tools provides dictionary-independant tools to deal with --
-- word lists and prepare dictionary creation. --
-- Note that the dictionary is intended to be generated and hard-coded, --
-- so the final client shouldn't need this package. --
------------------------------------------------------------------------------
with Ada.Containers.Indefinite_Doubly_Linked_Lists;
with Natools.S_Expressions;
private with Ada.Containers.Indefinite_Ordered_Maps;
private with Ada.Containers.Indefinite_Ordered_Sets;
package Natools.Smaz_Tools is
pragma Preelaborate;
package String_Lists is new Ada.Containers.Indefinite_Doubly_Linked_Lists
(String);
procedure Read_List
(List : out String_Lists.List;
Descriptor : in out S_Expressions.Descriptor'Class);
-- Read atoms from Descriptor to fill List
type String_Count is range 0 .. 2 ** 31 - 1;
-- Type for a number of substring occurrences
package Methods is
type Enum is (Encoded, Frequency, Gain);
end Methods;
-- Evaluation methods to select words to remove or include
type Word_Counter is private;
-- Accumulate frequency/occurrence counts for a set of strings
procedure Add_Word
(Counter : in out Word_Counter;
Word : in String;
Count : in String_Count := 1);
-- Include Count number of occurrences of Word in Counter
procedure Add_Substrings
(Counter : in out Word_Counter;
Phrase : in String;
Min_Size : in Positive;
Max_Size : in Positive);
-- Include all the substrings of Phrase whose lengths are
-- between Min_Size and Max_Size.
procedure Add_Words
(Counter : in out Word_Counter;
Phrase : in String;
Min_Size : in Positive;
Max_Size : in Positive);
-- Add the "words" from Phrase into Counter, with a word being currently
-- defined as anything between ASCII blanks or punctuation,
-- or in other words [0-9A-Za-z\x80-\xFF]+
procedure Filter_By_Count
(Counter : in out Word_Counter;
Threshold_Count : in String_Count);
-- Remove from Counter all entries whose count is below the threshold
function Simple_Dictionary
(Counter : in Word_Counter;
Word_Count : in Natural;
Method : in Methods.Enum := Methods.Encoded)
return String_Lists.List;
-- Return the Word_Count words in Counter that have the highest score,
-- the score being count * length.
procedure Simple_Dictionary_And_Pending
(Counter : in Word_Counter;
Word_Count : in Natural;
Selected : out String_Lists.List;
Pending : out String_Lists.List;
Method : in Methods.Enum := Methods.Encoded;
Max_Pending_Count : in Ada.Containers.Count_Type
:= Ada.Containers.Count_Type'Last);
-- Return in Selected the Word_Count words in Counter that have the
-- highest score, and in Pending the remaining words,
-- the score being count * length.
type Score_Value is range 0 .. 2 ** 31 - 1;
function Score_Encoded
(Count : in String_Count; Length : in Positive) return Score_Value
is (Score_Value (Count) * Score_Value (Length));
-- Score value using the amount of encoded data by the element
function Score_Frequency
(Count : in String_Count; Length : in Positive) return Score_Value
is (Score_Value (Count));
-- Score value using the number of times the element was used
function Score_Gain
(Count : in String_Count; Length : in Positive) return Score_Value
is (Score_Value (Count) * (Score_Value (Length) - 1));
-- Score value using the number of bytes saved using the element
function Score
(Count : in String_Count;
Length : in Positive;
Method : in Methods.Enum)
return Score_Value
is (case Method is
when Methods.Encoded => Score_Encoded (Count, Length),
when Methods.Frequency => Score_Frequency (Count, Length),
when Methods.Gain => Score_Gain (Count, Length));
-- Scare value with dynamically chosen method
private
package Word_Maps is new Ada.Containers.Indefinite_Ordered_Maps
(String, String_Count);
type Word_Counter is record
Map : Word_Maps.Map;
end record;
type Scored_Word (Size : Natural) is record
Word : String (1 .. Size);
Score : Score_Value;
end record;
function "<" (Left, Right : Scored_Word) return Boolean
is (Left.Score > Right.Score
or else (Left.Score = Right.Score and then Left.Word < Right.Word));
function To_Scored_Word
(Cursor : in Word_Maps.Cursor;
Method : in Methods.Enum)
return Scored_Word;
package Scored_Word_Sets is new Ada.Containers.Indefinite_Ordered_Sets
(Scored_Word);
end Natools.Smaz_Tools;
|