Overview
Comment: | smaz_tools: new package for dictionary-independent tools |
---|---|
Timelines: | family | ancestors | descendants | both | trunk |
Files: | files | file ages | folders |
SHA1: |
b87eafd22cc04b43af9c75a5c9f2c3e9 |
User & Date: | nat on 2016-11-22 20:04:55 |
Other Links: | manifest | tags |
Context
2016-11-23
| ||
21:10 | smaz_tools: add a dictionary-independent version of the dynamic hashes check-in: 24c6ae742e user: nat tags: trunk | |
2016-11-22
| ||
20:04 | smaz_tools: new package for dictionary-independent tools check-in: b87eafd22c user: nat tags: trunk | |
2016-11-21
| ||
20:40 | tools/smaz: fix direct dictionaries ignoring variable-length config check-in: b141a142f0 user: nat tags: trunk | |
Changes
Added src/natools-smaz_tools.adb version [3e624ef1b3].
> > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 | ------------------------------------------------------------------------------ -- Copyright (c) 2016, Natacha Porté -- -- -- -- Permission to use, copy, modify, and distribute this software for any -- -- purpose with or without fee is hereby granted, provided that the above -- -- copyright notice and this permission notice appear in all copies. -- -- -- -- THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES -- -- WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -- -- MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR -- -- ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -- -- WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN -- -- ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF -- -- OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. -- ------------------------------------------------------------------------------ package body Natools.Smaz_Tools is package Sx renames Natools.S_Expressions; ---------------------- -- Public Interface -- ---------------------- procedure Read_List (List : out String_Lists.List; Descriptor : in out S_Expressions.Descriptor'Class) is use type Sx.Events.Event; Event : Sx.Events.Event := Descriptor.Current_Event; begin String_Lists.Clear (List); if Event = Sx.Events.Open_List then Descriptor.Next (Event); end if; Read_Loop : loop case Event is when Sx.Events.Add_Atom => String_Lists.Append (List, Sx.To_String (Descriptor.Current_Atom)); when Sx.Events.Open_List => Descriptor.Close_Current_List; when Sx.Events.End_Of_Input | Sx.Events.Error | Sx.Events.Close_List => exit Read_Loop; end case; Descriptor.Next (Event); end loop Read_Loop; end Read_List; ------------------- -- Word Counting -- ------------------- procedure Add_Substrings (Counter : in out Word_Counter; Phrase : in String; Min_Size : in Positive; Max_Size : in Positive) is begin for First in Phrase'First .. Phrase'Last - Min_Size + 1 loop for Last in First + Min_Size - 1 .. Natural'Min (First + Max_Size - 1, Phrase'Last) loop Add_Word (Counter, Phrase (First .. Last)); end loop; end loop; end Add_Substrings; procedure Add_Word (Counter : in out Word_Counter; Word : in String; Count : in String_Count := 1) is procedure Update (Key : in String; Element : in out String_Count); procedure Update (Key : in String; Element : in out String_Count) is pragma Unreferenced (Key); begin Element := Element + Count; end Update; Cursor : constant Word_Maps.Cursor := Word_Maps.Find (Counter.Map, Word); begin if Word_Maps.Has_Element (Cursor) then Word_Maps.Update_Element (Counter.Map, Cursor, Update'Access); else Word_Maps.Insert (Counter.Map, Word, Count); end if; end Add_Word; procedure Add_Words (Counter : in out Word_Counter; Phrase : in String; Min_Size : in Positive; Max_Size : in Positive) is subtype Word_Part is Character with Static_Predicate => Word_Part in '0' .. '9' | 'A' .. 'Z' | 'a' .. 'z' | Character'Val (128) .. Character'Val (255); I, First, Next : Positive; begin if Max_Size < Min_Size then return; end if; I := Phrase'First; Main_Loop : while I in Phrase'Range loop Skip_Non_Word : while I in Phrase'Range and then Phrase (I) not in Word_Part loop I := I + 1; end loop Skip_Non_Word; exit Main_Loop when I not in Phrase'Range; First := I; Skip_Word : while I in Phrase'Range and then Phrase (I) in Word_Part loop I := I + 1; end loop Skip_Word; Next := I; if Next - First in Min_Size .. Max_Size then Add_Word (Counter, Phrase (First .. Next - 1)); end if; end loop Main_Loop; end Add_Words; procedure Filter_By_Count (Counter : in out Word_Counter; Threshold_Count : in String_Count) is Position, Next : Word_Maps.Cursor; begin Position := Word_Maps.First (Counter.Map); while Word_Maps.Has_Element (Position) loop Next := Word_Maps.Next (Position); if Word_Maps.Element (Position) < Threshold_Count then Word_Maps.Delete (Counter.Map, Position); end if; Position := Next; end loop; pragma Assert (for all Count of Counter.Map => Count >= Threshold_Count); end Filter_By_Count; function Simple_Dictionary (Counter : in Word_Counter; Word_Count : in Natural; Method : in Methods.Enum := Methods.Encoded) return String_Lists.List is use type Ada.Containers.Count_Type; Target_Count : constant Ada.Containers.Count_Type := Ada.Containers.Count_Type (Word_Count); Set : Scored_Word_Sets.Set; Result : String_Lists.List; begin for Cursor in Word_Maps.Iterate (Counter.Map) loop Scored_Word_Sets.Include (Set, To_Scored_Word (Cursor, Method)); if Scored_Word_Sets.Length (Set) > Target_Count then Scored_Word_Sets.Delete_Last (Set); end if; end loop; for Cursor in Scored_Word_Sets.Iterate (Set) loop Result.Append (Scored_Word_Sets.Element (Cursor).Word); end loop; return Result; end Simple_Dictionary; procedure Simple_Dictionary_And_Pending (Counter : in Word_Counter; Word_Count : in Natural; Selected : out String_Lists.List; Pending : out String_Lists.List; Method : in Methods.Enum := Methods.Encoded; Max_Pending_Count : in Ada.Containers.Count_Type := Ada.Containers.Count_Type'Last) is use type Ada.Containers.Count_Type; Target_Count : constant Ada.Containers.Count_Type := Ada.Containers.Count_Type (Word_Count); Set : Scored_Word_Sets.Set; begin for Cursor in Word_Maps.Iterate (Counter.Map) loop Scored_Word_Sets.Insert (Set, To_Scored_Word (Cursor, Method)); end loop; Selected := String_Lists.Empty_List; Pending := String_Lists.Empty_List; for Cursor in Scored_Word_Sets.Iterate (Set) loop if String_Lists.Length (Selected) < Target_Count then Selected.Append (Scored_Word_Sets.Element (Cursor).Word); else Pending.Append (Scored_Word_Sets.Element (Cursor).Word); exit when String_Lists.Length (Selected) >= Max_Pending_Count; end if; end loop; end Simple_Dictionary_And_Pending; function To_Scored_Word (Cursor : in Word_Maps.Cursor; Method : in Methods.Enum) return Scored_Word is Word : constant String := Word_Maps.Key (Cursor); begin return Scored_Word' (Size => Word'Length, Word => Word, Score => Score (Word_Maps.Element (Cursor), Word'Length, Method)); end To_Scored_Word; end Natools.Smaz_Tools; |
Added src/natools-smaz_tools.ads version [ca9e37f8da].
> > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 | ------------------------------------------------------------------------------ -- Copyright (c) 2016, Natacha Porté -- -- -- -- Permission to use, copy, modify, and distribute this software for any -- -- purpose with or without fee is hereby granted, provided that the above -- -- copyright notice and this permission notice appear in all copies. -- -- -- -- THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES -- -- WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -- -- MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR -- -- ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -- -- WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN -- -- ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF -- -- OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. -- ------------------------------------------------------------------------------ ------------------------------------------------------------------------------ -- Natools.Smaz_Tools provides dictionary-independant tools to deal with -- -- word lists and prepare dictionary creation. -- -- Note that the dictionary is intended to be generated and hard-coded, -- -- so the final client shouldn't need this package. -- ------------------------------------------------------------------------------ with Ada.Containers.Indefinite_Doubly_Linked_Lists; with Natools.S_Expressions; private with Ada.Containers.Indefinite_Ordered_Maps; private with Ada.Containers.Indefinite_Ordered_Sets; package Natools.Smaz_Tools is pragma Preelaborate; package String_Lists is new Ada.Containers.Indefinite_Doubly_Linked_Lists (String); procedure Read_List (List : out String_Lists.List; Descriptor : in out S_Expressions.Descriptor'Class); -- Read atoms from Descriptor to fill List type String_Count is range 0 .. 2 ** 31 - 1; -- Type for a number of substring occurrences package Methods is type Enum is (Encoded, Frequency, Gain); end Methods; -- Evaluation methods to select words to remove or include type Word_Counter is private; -- Accumulate frequency/occurrence counts for a set of strings procedure Add_Word (Counter : in out Word_Counter; Word : in String; Count : in String_Count := 1); -- Include Count number of occurrences of Word in Counter procedure Add_Substrings (Counter : in out Word_Counter; Phrase : in String; Min_Size : in Positive; Max_Size : in Positive); -- Include all the substrings of Phrase whose lengths are -- between Min_Size and Max_Size. procedure Add_Words (Counter : in out Word_Counter; Phrase : in String; Min_Size : in Positive; Max_Size : in Positive); -- Add the "words" from Phrase into Counter, with a word being currently -- defined as anything between ASCII blanks or punctuation, -- or in other words [0-9A-Za-z\x80-\xFF]+ procedure Filter_By_Count (Counter : in out Word_Counter; Threshold_Count : in String_Count); -- Remove from Counter all entries whose count is below the threshold function Simple_Dictionary (Counter : in Word_Counter; Word_Count : in Natural; Method : in Methods.Enum := Methods.Encoded) return String_Lists.List; -- Return the Word_Count words in Counter that have the highest score, -- the score being count * length. procedure Simple_Dictionary_And_Pending (Counter : in Word_Counter; Word_Count : in Natural; Selected : out String_Lists.List; Pending : out String_Lists.List; Method : in Methods.Enum := Methods.Encoded; Max_Pending_Count : in Ada.Containers.Count_Type := Ada.Containers.Count_Type'Last); -- Return in Selected the Word_Count words in Counter that have the -- highest score, and in Pending the remaining words, -- the score being count * length. type Score_Value is range 0 .. 2 ** 31 - 1; function Score_Encoded (Count : in String_Count; Length : in Positive) return Score_Value is (Score_Value (Count) * Score_Value (Length)); -- Score value using the amount of encoded data by the element function Score_Frequency (Count : in String_Count; Length : in Positive) return Score_Value is (Score_Value (Count)); -- Score value using the number of times the element was used function Score_Gain (Count : in String_Count; Length : in Positive) return Score_Value is (Score_Value (Count) * (Score_Value (Length) - 1)); -- Score value using the number of bytes saved using the element function Score (Count : in String_Count; Length : in Positive; Method : in Methods.Enum) return Score_Value is (case Method is when Methods.Encoded => Score_Encoded (Count, Length), when Methods.Frequency => Score_Frequency (Count, Length), when Methods.Gain => Score_Gain (Count, Length)); -- Scare value with dynamically chosen method private package Word_Maps is new Ada.Containers.Indefinite_Ordered_Maps (String, String_Count); type Word_Counter is record Map : Word_Maps.Map; end record; type Scored_Word (Size : Natural) is record Word : String (1 .. Size); Score : Score_Value; end record; function "<" (Left, Right : Scored_Word) return Boolean is (Left.Score > Right.Score or else (Left.Score = Right.Score and then Left.Word < Right.Word)); function To_Scored_Word (Cursor : in Word_Maps.Cursor; Method : in Methods.Enum) return Scored_Word; package Scored_Word_Sets is new Ada.Containers.Indefinite_Ordered_Sets (Scored_Word); end Natools.Smaz_Tools; |