Overview
Comment: | smaz-tools: add scoring method support in simple dictionary building |
---|---|
Timelines: | family | ancestors | descendants | both | trunk |
Files: | files | file ages | folders |
SHA1: |
90573face28b62341b36945719e49984 |
User & Date: | nat on 2016-11-09 21:14:39 |
Other Links: | manifest | tags |
Context
2016-11-10
| ||
20:23 | tools/smaz: use current scoring method for initial dictionary building check-in: d09d9e46a9 user: nat tags: trunk | |
2016-11-09
| ||
21:14 | smaz-tools: add scoring method support in simple dictionary building check-in: 90573face2 user: nat tags: trunk | |
2016-11-08
| ||
20:44 | tools/smaz: use scoring from Natools.Smaz.Tools check-in: 99442da1d7 user: nat tags: trunk | |
Changes
Modified src/natools-smaz-tools.adb from [40f74c4b4e] to [06e0074235].
︙ | ︙ | |||
pragma Assert (for all Count of Counter.Map => Count >= Threshold_Count);
end Filter_By_Count;


function Simple_Dictionary
  (Counter : in Word_Counter;
   Word_Count : in Natural;
   Method : in Methods.Enum := Methods.Encoded)
  return String_Lists.List
is
   --  Return at most Word_Count words from Counter, keeping those with
   --  the best score as computed by To_Scored_Word for the given Method.

   use type Ada.Containers.Count_Type;
   Target_Count : constant Ada.Containers.Count_Type
     := Ada.Containers.Count_Type (Word_Count);
   Set : Scored_Word_Sets.Set;
   Result : String_Lists.List;
begin
   --  Keep the working set bounded to Target_Count elements: after each
   --  inclusion, drop the element that sorts last when the set overflows.
   for Cursor in Word_Maps.Iterate (Counter.Map) loop
      Scored_Word_Sets.Include (Set, To_Scored_Word (Cursor, Method));

      if Scored_Word_Sets.Length (Set) > Target_Count then
         Scored_Word_Sets.Delete_Last (Set);
      end if;
   end loop;

   --  Output the surviving words in set order.
   for Cursor in Scored_Word_Sets.Iterate (Set) loop
      Result.Append (Scored_Word_Sets.Element (Cursor).Word);
   end loop;

   return Result;
end Simple_Dictionary;


procedure Simple_Dictionary_And_Pending
  (Counter : in Word_Counter;
   Word_Count : in Natural;
   Selected : out String_Lists.List;
   Pending : out String_Lists.List;
   Method : in Methods.Enum := Methods.Encoded;
   Max_Pending_Count : in Ada.Containers.Count_Type
     := Ada.Containers.Count_Type'Last)
is
   --  Score every word of Counter with Method, then store the first
   --  Word_Count words (in set order) in Selected and the following
   --  ones in Pending, stopping once Pending holds Max_Pending_Count
   --  words.

   use type Ada.Containers.Count_Type;
   Target_Count : constant Ada.Containers.Count_Type
     := Ada.Containers.Count_Type (Word_Count);
   Set : Scored_Word_Sets.Set;
begin
   for Cursor in Word_Maps.Iterate (Counter.Map) loop
      Scored_Word_Sets.Insert (Set, To_Scored_Word (Cursor, Method));
   end loop;

   Selected := String_Lists.Empty_List;
   Pending := String_Lists.Empty_List;

   for Cursor in Scored_Word_Sets.Iterate (Set) loop
      if String_Lists.Length (Selected) < Target_Count then
         Selected.Append (Scored_Word_Sets.Element (Cursor).Word);
      else
         --  The limit applies to Pending, not Selected, and is tested
         --  before appending so that a limit of zero leaves Pending
         --  empty.
         exit when String_Lists.Length (Pending) >= Max_Pending_Count;
         Pending.Append (Scored_Word_Sets.Element (Cursor).Word);
      end if;
   end loop;
end Simple_Dictionary_And_Pending;


function To_Scored_Word
  (Cursor : in Word_Maps.Cursor;
   Method : in Methods.Enum)
  return Scored_Word
is
   --  Build the Scored_Word for the map entry designated by Cursor.
   --  The score is the entry count multiplied by a factor that depends
   --  on Method:
   --    Encoded   => word length
   --    Frequency => 1
   --    Gain      => word length - 1

   Word : constant String := Word_Maps.Key (Cursor);
   Factor : Score_Value;
begin
   case Method is
      when Methods.Encoded =>
         Factor := Word'Length;
      when Methods.Frequency =>
         Factor := 1;
      when Methods.Gain =>
         Factor := Word'Length - 1;
   end case;

   return Scored_Word'
     (Size => Word'Length,
      Word => Word,
      Score => Score_Value (Word_Maps.Element (Cursor)) * Factor);
end To_Scored_Word;


function Worst_Index
  (Dict : in Dictionary;
   Counts : in Dictionary_Counts;
   Method : in Methods.Enum)
︙ | ︙ |
Modified src/natools-smaz-tools.ads from [716677a420] to [52958dca03].
︙ | ︙ | |||
procedure Filter_By_Count
  (Counter : in out Word_Counter;
   Threshold_Count : in String_Count);
   --  Remove from Counter all entries whose count is below the threshold

function Simple_Dictionary
  (Counter : in Word_Counter;
   Word_Count : in Natural;
   Method : in Methods.Enum := Methods.Encoded)
  return String_Lists.List;
   --  Return the Word_Count words in Counter that have the highest score,
   --  the score being computed according to Method
   --  (count * length for the default Methods.Encoded).

procedure Simple_Dictionary_And_Pending
  (Counter : in Word_Counter;
   Word_Count : in Natural;
   Selected : out String_Lists.List;
   Pending : out String_Lists.List;
   Method : in Methods.Enum := Methods.Encoded;
   Max_Pending_Count : in Ada.Containers.Count_Type
     := Ada.Containers.Count_Type'Last);
   --  Return in Selected the Word_Count words in Counter that have the
   --  highest score, and in Pending the remaining words,
   --  the score being computed according to Method
   --  (count * length for the default Methods.Encoded).
   --  Max_Pending_Count is intended as an upper bound on the number of
   --  words stored in Pending.

type Dictionary_Counts is
︙ | ︙ | |||
Score : Score_Value;
end record;

function "<" (Left, Right : Scored_Word) return Boolean
  is (if Left.Score /= Right.Score
      then Left.Score > Right.Score
      else Left.Word < Right.Word);
   --  Ordering used by Scored_Word_Sets: a greater score sorts first,
   --  and words with equal scores are ordered by their text.

function To_Scored_Word
  (Cursor : in Word_Maps.Cursor;
   Method : in Methods.Enum)
  return Scored_Word;
   --  Build a Scored_Word from the word map entry designated by Cursor,
   --  scored according to Method.

package Scored_Word_Sets is new Ada.Containers.Indefinite_Ordered_Sets
  (Scored_Word);

package Dictionary_Maps is new Ada.Containers.Indefinite_Ordered_Maps
  (String, Ada.Streams.Stream_Element);
︙ | ︙ |