Natools

Check-in [90573face2]
Login
Overview
Comment:smaz-tools: add scoring method support in simple dictionary building
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: 90573face28b62341b36945719e499843fe80e88
User & Date: nat on 2016-11-09 21:14:39
Other Links: manifest | tags
Context
2016-11-10
20:23
tools/smaz: use current scoring method for initial dictionary building check-in: d09d9e46a9 user: nat tags: trunk
2016-11-09
21:14
smaz-tools: add scoring method support in simple dictionary building check-in: 90573face2 user: nat tags: trunk
2016-11-08
20:44
tools/smaz: use scoring from Natools.Smaz.Tools check-in: 99442da1d7 user: nat tags: trunk
Changes

Modified src/natools-smaz-tools.adb from [40f74c4b4e] to [06e0074235].

772
773
774
775
776
777
778
779


780
781
782
783
784
785
786
787
788
789

790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808

809
810
811
812
813
814
815
816
817
818

819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835



836
837
838

839






840
841
842
843

844
845
846
847
848
849
850
772
773
774
775
776
777
778

779
780
781
782
783
784
785
786
787
788
789

790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819

820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836

837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853

854
855
856
857
858
859
860
861







-
+
+









-
+



















+









-
+
















-
+
+
+



+

+
+
+
+
+
+



-
+








      pragma Assert (for all Count of Counter.Map => Count >= Threshold_Count);
   end Filter_By_Count;


   function Simple_Dictionary
     (Counter : in Word_Counter;
      Word_Count : in Natural)
      Word_Count : in Natural;
      Method : in Methods.Enum := Methods.Encoded)
     return String_Lists.List
   is
      use type Ada.Containers.Count_Type;
      Target_Count : constant Ada.Containers.Count_Type
        := Ada.Containers.Count_Type (Word_Count);
      Set : Scored_Word_Sets.Set;
      Result : String_Lists.List;
   begin
      for Cursor in Word_Maps.Iterate (Counter.Map) loop
         Scored_Word_Sets.Include (Set, To_Scored_Word (Cursor));
         Scored_Word_Sets.Include (Set, To_Scored_Word (Cursor, Method));

         if Scored_Word_Sets.Length (Set) > Target_Count then
            Scored_Word_Sets.Delete_Last (Set);
         end if;
      end loop;

      for Cursor in Scored_Word_Sets.Iterate (Set) loop
         Result.Append (Scored_Word_Sets.Element (Cursor).Word);
      end loop;

      return Result;
   end Simple_Dictionary;


   procedure Simple_Dictionary_And_Pending
     (Counter : in Word_Counter;
      Word_Count : in Natural;
      Selected : out String_Lists.List;
      Pending : out String_Lists.List;
      Method : in Methods.Enum := Methods.Encoded;
      Max_Pending_Count : in Ada.Containers.Count_Type
        := Ada.Containers.Count_Type'Last)
   is
      use type Ada.Containers.Count_Type;
      Target_Count : constant Ada.Containers.Count_Type
        := Ada.Containers.Count_Type (Word_Count);
      Set : Scored_Word_Sets.Set;
   begin
      for Cursor in Word_Maps.Iterate (Counter.Map) loop
         Scored_Word_Sets.Insert (Set, To_Scored_Word (Cursor));
         Scored_Word_Sets.Insert (Set, To_Scored_Word (Cursor, Method));
      end loop;

      Selected := String_Lists.Empty_List;
      Pending := String_Lists.Empty_List;

      for Cursor in Scored_Word_Sets.Iterate (Set) loop
         if String_Lists.Length (Selected) < Target_Count then
            Selected.Append (Scored_Word_Sets.Element (Cursor).Word);
         else
            Pending.Append (Scored_Word_Sets.Element (Cursor).Word);
            exit when String_Lists.Length (Selected) >= Max_Pending_Count;
         end if;
      end loop;
   end Simple_Dictionary_And_Pending;


   function To_Scored_Word (Cursor : in Word_Maps.Cursor)
   function To_Scored_Word
     (Cursor : in Word_Maps.Cursor;
      Method : in Methods.Enum)
     return Scored_Word
   is
      Word : constant String := Word_Maps.Key (Cursor);
      Factor : Score_Value;
   begin
      case Method is
         when Methods.Encoded => Factor := Word'Length;
         when Methods.Frequency => Factor := 1;
         when Methods.Gain => Factor := Word'Length - 1;
      end case;

      return Scored_Word'
        (Size => Word'Length,
         Word => Word,
         Score => Score_Value (Word_Maps.Element (Cursor)) * Word'Length);
         Score => Score_Value (Word_Maps.Element (Cursor)) * Factor);
   end To_Scored_Word;


   function Worst_Index
     (Dict : in Dictionary;
      Counts : in Dictionary_Counts;
      Method : in Methods.Enum)

Modified src/natools-smaz-tools.ads from [716677a420] to [52958dca03].

154
155
156
157
158
159
160
161


162
163
164
165
166
167
168
169
170

171
172
173
174
175
176
177
154
155
156
157
158
159
160

161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179







-
+
+









+







   procedure Filter_By_Count
     (Counter : in out Word_Counter;
      Threshold_Count : in String_Count);
      --  Remove from Counter all entries whose count is below the threshold

   function Simple_Dictionary
     (Counter : in Word_Counter;
      Word_Count : in Natural)
      Word_Count : in Natural;
      Method : in Methods.Enum := Methods.Encoded)
     return String_Lists.List;
      --  Return the Word_Count words in Counter that have the highest score,
      --  the score being count * length.

   procedure Simple_Dictionary_And_Pending
     (Counter : in Word_Counter;
      Word_Count : in Natural;
      Selected : out String_Lists.List;
      Pending : out String_Lists.List;
      Method : in Methods.Enum := Methods.Encoded;
      Max_Pending_Count : in Ada.Containers.Count_Type
        := Ada.Containers.Count_Type'Last);
      --  Return in Selected the Word_Count words in Counter that have the
      --  highest score, and in Pending the remaining words,
      --  the score being count * length.

   type Dictionary_Counts is
259
260
261
262
263
264
265
266



267
268
269
270
271
272
273
261
262
263
264
265
266
267

268
269
270
271
272
273
274
275
276
277







-
+
+
+







      Score : Score_Value;
   end record;

   function "<" (Left, Right : Scored_Word) return Boolean
     is (Left.Score > Right.Score
         or else (Left.Score = Right.Score and then Left.Word < Right.Word));

   function To_Scored_Word (Cursor : in Word_Maps.Cursor)
   function To_Scored_Word
     (Cursor : in Word_Maps.Cursor;
      Method : in Methods.Enum)
     return Scored_Word;

   package Scored_Word_Sets is new Ada.Containers.Indefinite_Ordered_Sets
     (Scored_Word);

   package Dictionary_Maps is new Ada.Containers.Indefinite_Ordered_Maps
     (String, Ada.Streams.Stream_Element);