Natools

Check-in [ecae1e85f4]
Login
Overview
Comment: tools/smaz: add command-line options for min and max dictionary size
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: ecae1e85f47d03ea91382dbd78af2fd78a7d100f
User & Date: nat on 2017-05-19 21:04:57
Other Links: manifest | tags
Context
2017-05-20
19:25
tools/smaz: move log-message construction of Optimization_Round check-in: 032d847343 user: nat tags: trunk
2017-05-19
21:04
tools/smaz: add command-line options for min and max dictionary size check-in: ecae1e85f4 user: nat tags: trunk
2017-05-18
21:13
tools/smaz: implement forced words for optimized dictionary generation check-in: 014ca1d01b user: nat tags: trunk
Changes

Modified tools/smaz.adb from [4b4875a3ad] to [309a261406].

101
102
103
104
105
106
107


108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125

126
127
128
129
130
131
132
         No_Stat_Output,
         Text_List_Input,
         Fast_Text_Input,
         Max_Word_Size,
         Sx_Output,
         No_Sx_Output,
         Force_Word,


         No_Vlen_Verbatim,
         Score_Method,
         Vlen_Verbatim);
   end Options;

   package Getopt is new Natools.Getopt_Long (Options.Id);

   type Callback is new Getopt.Handlers.Callback with record
      Algorithm : Algorithms.Enum := Algorithms.Base_256;
      Display_Help : Boolean := False;
      Need_Dictionary : Boolean := False;
      Stat_Output : Boolean := False;
      Sx_Output : Boolean := False;
      Sx_Dict_Output : Boolean := False;
      Min_Sub_Size : Positive := 1;
      Max_Sub_Size : Positive := 3;
      Max_Word_Size : Positive := 10;
      Dict_Size : Positive := 254;

      Vlen_Verbatim : Boolean := True;
      Max_Pending : Ada.Containers.Count_Type
        := Ada.Containers.Count_Type'Last;
      Job_Count : Natural := 0;
      Filter_Threshold : Natools.Smaz_Tools.String_Count := 0;
      Score_Method : Methods.Enum := Methods.Encoded;
      Action : Actions.Enum := Actions.Nothing;







>
>

















|
>







101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
         No_Stat_Output,
         Text_List_Input,
         Fast_Text_Input,
         Max_Word_Size,
         Sx_Output,
         No_Sx_Output,
         Force_Word,
         Max_Dict_Size,
         Min_Dict_Size,
         No_Vlen_Verbatim,
         Score_Method,
         Vlen_Verbatim);
   end Options;

   package Getopt is new Natools.Getopt_Long (Options.Id);

   type Callback is new Getopt.Handlers.Callback with record
      Algorithm : Algorithms.Enum := Algorithms.Base_256;
      Display_Help : Boolean := False;
      Need_Dictionary : Boolean := False;
      Stat_Output : Boolean := False;
      Sx_Output : Boolean := False;
      Sx_Dict_Output : Boolean := False;
      Min_Sub_Size : Positive := 1;
      Max_Sub_Size : Positive := 3;
      Max_Word_Size : Positive := 10;
      Max_Dict_Size : Positive := 254;
      Min_Dict_Size : Positive := 254;
      Vlen_Verbatim : Boolean := True;
      Max_Pending : Ada.Containers.Count_Type
        := Ada.Containers.Count_Type'Last;
      Job_Count : Natural := 0;
      Filter_Threshold : Natools.Smaz_Tools.String_Count := 0;
      Score_Method : Methods.Enum := Methods.Encoded;
      Action : Actions.Enum := Actions.Nothing;
370
371
372
373
374
375
376


377
378
379
380
381
382
383
384
385
386
387


388
389
390
391
392
393
394
         Score : in out Ada.Streams.Stream_Element_Count;
         Counts : in out Dictionary_Counts;
         First : in Dictionary_Entry;
         Pending_Words : in out String_Lists.List;
         Input_Texts : in String_Lists.List;
         Job_Count : in Natural;
         Method : in Methods;


         Updated : out Boolean);
      --  Try to improve on Dict by replacing a single entry from it with
      --  one of the substring in Pending_Words.

      function Optimize_Dictionary
        (Base : in Dictionary;
         First : in Dictionary_Entry;
         Pending_Words : in String_Lists.List;
         Input_Texts : in String_Lists.List;
         Job_Count : in Natural;
         Method : in Methods)


        return Dictionary;
      --  Optimize the dictionary on Input_Texts, starting with Base and
      --  adding substrings from Pending_Words. Operates only on words
      --  at First and beyond.

      procedure Parallel_Evaluate_Dictionary
        (Job_Count : in Positive;







>
>










|
>
>







373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
         Score : in out Ada.Streams.Stream_Element_Count;
         Counts : in out Dictionary_Counts;
         First : in Dictionary_Entry;
         Pending_Words : in out String_Lists.List;
         Input_Texts : in String_Lists.List;
         Job_Count : in Natural;
         Method : in Methods;
         Min_Dict_Size : in Positive;
         Max_Dict_Size : in Positive;
         Updated : out Boolean);
      --  Try to improve on Dict by replacing a single entry from it with
      --  one of the substring in Pending_Words.

      function Optimize_Dictionary
        (Base : in Dictionary;
         First : in Dictionary_Entry;
         Pending_Words : in String_Lists.List;
         Input_Texts : in String_Lists.List;
         Job_Count : in Natural;
         Method : in Methods;
         Min_Dict_Size : in Positive;
         Max_Dict_Size : in Positive)
        return Dictionary;
      --  Optimize the dictionary on Input_Texts, starting with Base and
      --  adding substrings from Pending_Words. Operates only on words
      --  at First and beyond.

      procedure Parallel_Evaluate_Dictionary
        (Job_Count : in Positive;
560
561
562
563
564
565
566


567
568


569
570
571
572
573
574
575
         Score : in out Ada.Streams.Stream_Element_Count;
         Counts : in out Dictionary_Counts;
         First : in Dictionary_Entry;
         Pending_Words : in out String_Lists.List;
         Input_Texts : in String_Lists.List;
         Job_Count : in Natural;
         Method : in Methods;


         Updated : out Boolean)
      is


         use type Ada.Streams.Stream_Element_Offset;

         New_Value : Ada.Strings.Unbounded.Unbounded_String;
         New_Position : String_Lists.Cursor;
         Worst_Index : constant Dictionary_Entry
           := Worst_Element
              (Dict.Element, Counts, Method, First, Last_Code (Dict.Element));







>
>


>
>







567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
         Score : in out Ada.Streams.Stream_Element_Count;
         Counts : in out Dictionary_Counts;
         First : in Dictionary_Entry;
         Pending_Words : in out String_Lists.List;
         Input_Texts : in String_Lists.List;
         Job_Count : in Natural;
         Method : in Methods;
         Min_Dict_Size : in Positive;
         Max_Dict_Size : in Positive;
         Updated : out Boolean)
      is
         pragma Unreferenced (Min_Dict_Size);
         pragma Unreferenced (Max_Dict_Size);
         use type Ada.Streams.Stream_Element_Offset;

         New_Value : Ada.Strings.Unbounded.Unbounded_String;
         New_Position : String_Lists.Cursor;
         Worst_Index : constant Dictionary_Entry
           := Worst_Element
              (Dict.Element, Counts, Method, First, Last_Code (Dict.Element));
628
629
630
631
632
633
634
635


636
637
638
639
640
641
642

      function Optimize_Dictionary
        (Base : in Dictionary;
         First : in Dictionary_Entry;
         Pending_Words : in String_Lists.List;
         Input_Texts : in String_Lists.List;
         Job_Count : in Natural;
         Method : in Methods)


        return Dictionary
      is
         Holder : Holders.Holder := Holders.To_Holder (Base);
         Pending : String_Lists.List := Pending_Words;
         Score : Ada.Streams.Stream_Element_Count;
         Counts : Dictionary_Counts;
         Running : Boolean := True;







|
>
>







639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655

      function Optimize_Dictionary
        (Base : in Dictionary;
         First : in Dictionary_Entry;
         Pending_Words : in String_Lists.List;
         Input_Texts : in String_Lists.List;
         Job_Count : in Natural;
         Method : in Methods;
         Min_Dict_Size : in Positive;
         Max_Dict_Size : in Positive)
        return Dictionary
      is
         Holder : Holders.Holder := Holders.To_Holder (Base);
         Pending : String_Lists.List := Pending_Words;
         Score : Ada.Streams.Stream_Element_Count;
         Counts : Dictionary_Counts;
         Running : Boolean := True;
650
651
652
653
654
655
656


657
658
659
660
661
662
663
               Score,
               Counts,
               First,
               Pending,
               Input_Texts,
               Job_Count,
               Method,


               Running);
         end loop;

         return Holder.Element;
      end Optimize_Dictionary;









>
>







663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
               Score,
               Counts,
               First,
               Pending,
               Input_Texts,
               Job_Count,
               Method,
               Min_Dict_Size,
               Max_Dict_Size,
               Running);
         end loop;

         return Holder.Element;
      end Optimize_Dictionary;


1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114

1115
1116
1117
1118
1119
1120
1121
                  To_Dictionary (Input, Handler.Vlen_Verbatim),
                  Data_List,
                  Method);

            when Dict_Sources.Text_List =>
               declare
                  Needed : constant Integer
                    := Handler.Dict_Size
                     - Natural (Handler.Forced_Words.Length);
                  Selected, Pending : String_Lists.List;
                  First : Dictionary_Entry := Dictionary_Entry'First;
               begin
                  if Needed <= 0 then
                     for Word of reverse Handler.Forced_Words loop
                        Selected.Prepend (Word);
                        if Positive (Selected.Length) = Handler.Dict_Size then
                           return To_Dictionary
                             (Selected, Handler.Vlen_Verbatim);
                        end if;
                     end loop;

                  end if;

                  Simple_Dictionary_And_Pending
                    (Make_Word_Counter (Handler, Input),
                     Needed,
                     Selected,
                     Pending,







|







|
<
|
<

>







1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125

1126

1127
1128
1129
1130
1131
1132
1133
1134
1135
                  To_Dictionary (Input, Handler.Vlen_Verbatim),
                  Data_List,
                  Method);

            when Dict_Sources.Text_List =>
               declare
                  Needed : constant Integer
                    := Handler.Max_Dict_Size
                     - Natural (Handler.Forced_Words.Length);
                  Selected, Pending : String_Lists.List;
                  First : Dictionary_Entry := Dictionary_Entry'First;
               begin
                  if Needed <= 0 then
                     for Word of reverse Handler.Forced_Words loop
                        Selected.Prepend (Word);
                        exit when Positive (Selected.Length)

                          = Handler.Max_Dict_Size;

                     end loop;
                     return To_Dictionary (Selected, Handler.Vlen_Verbatim);
                  end if;

                  Simple_Dictionary_And_Pending
                    (Make_Word_Counter (Handler, Input),
                     Needed,
                     Selected,
                     Pending,
1129
1130
1131
1132
1133
1134
1135
1136


1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164

                  return Optimize_Dictionary
                    (To_Dictionary (Selected, Handler.Vlen_Verbatim),
                     First,
                     Pending,
                     Input,
                     Handler.Job_Count,
                     Method);


               end;

            when Dict_Sources.Unoptimized_Text_List =>
               declare
                  Needed : constant Integer
                    := Handler.Dict_Size
                     - Natural (Handler.Forced_Words.Length);
                  All_Words : String_Lists.List;
               begin
                  if Needed > 0 then
                     All_Words := Simple_Dictionary
                       (Make_Word_Counter (Handler, Input), Needed, Method);

                     for Word of reverse Handler.Forced_Words loop
                        All_Words.Prepend (Word);
                     end loop;
                  else
                     for Word of reverse Handler.Forced_Words loop
                        All_Words.Prepend (Word);
                        exit when Positive (All_Words.Length)
                          >= Handler.Dict_Size;
                     end loop;
                  end if;

                  return To_Dictionary (All_Words, Handler.Vlen_Verbatim);
               end;
         end case;
      end To_Dictionary;







|
>
>





|














|







1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180

                  return Optimize_Dictionary
                    (To_Dictionary (Selected, Handler.Vlen_Verbatim),
                     First,
                     Pending,
                     Input,
                     Handler.Job_Count,
                     Method,
                     Handler.Min_Dict_Size,
                     Handler.Max_Dict_Size);
               end;

            when Dict_Sources.Unoptimized_Text_List =>
               declare
                  Needed : constant Integer
                    := Handler.Max_Dict_Size
                     - Natural (Handler.Forced_Words.Length);
                  All_Words : String_Lists.List;
               begin
                  if Needed > 0 then
                     All_Words := Simple_Dictionary
                       (Make_Word_Counter (Handler, Input), Needed, Method);

                     for Word of reverse Handler.Forced_Words loop
                        All_Words.Prepend (Word);
                     end loop;
                  else
                     for Word of reverse Handler.Forced_Words loop
                        All_Words.Prepend (Word);
                        exit when Positive (All_Words.Length)
                          >= Handler.Max_Dict_Size;
                     end loop;
                  end if;

                  return To_Dictionary (All_Words, Handler.Vlen_Verbatim);
               end;
         end case;
      end To_Dictionary;
1377
1378
1379
1380
1381
1382
1383
1384

1385
1386
1387
1388
1389
1390
1391
         when Options.Score_Method =>
            Handler.Score_Method := Methods.Enum'Value (Argument);

         when Options.Max_Pending =>
            Handler.Max_Pending := Ada.Containers.Count_Type'Value (Argument);

         when Options.Dict_Size =>
            Handler.Dict_Size := Positive'Value (Argument);


         when Options.Vlen_Verbatim =>
            Handler.Vlen_Verbatim := True;

         when Options.No_Vlen_Verbatim =>
            Handler.Vlen_Verbatim := False;








|
>







1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
         when Options.Score_Method =>
            Handler.Score_Method := Methods.Enum'Value (Argument);

         when Options.Max_Pending =>
            Handler.Max_Pending := Ada.Containers.Count_Type'Value (Argument);

         when Options.Dict_Size =>
            Handler.Min_Dict_Size := Positive'Value (Argument);
            Handler.Max_Dict_Size := Positive'Value (Argument);

         when Options.Vlen_Verbatim =>
            Handler.Vlen_Verbatim := True;

         when Options.No_Vlen_Verbatim =>
            Handler.Vlen_Verbatim := False;

1409
1410
1411
1412
1413
1414
1415






1416
1417
1418
1419
1420
1421
1422
               Handler.Need_Dictionary := True;
               Handler.Forced_Words.Append (Argument);

               if Handler.Action in Actions.Nothing then
                  Handler.Action := Actions.Adjust_Dictionary;
               end if;
            end if;






      end case;
   end Option;


   function Activate_Dictionary (Dict : in Natools.Smaz_256.Dictionary)
     return Natools.Smaz_256.Dictionary
   is







>
>
>
>
>
>







1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
               Handler.Need_Dictionary := True;
               Handler.Forced_Words.Append (Argument);

               if Handler.Action in Actions.Nothing then
                  Handler.Action := Actions.Adjust_Dictionary;
               end if;
            end if;

         when Options.Max_Dict_Size =>
            Handler.Max_Dict_Size := Positive'Value (Argument);

         when Options.Min_Dict_Size =>
            Handler.Min_Dict_Size := Positive'Value (Argument);
      end case;
   end Option;


   function Activate_Dictionary (Dict : in Natools.Smaz_256.Dictionary)
     return Natools.Smaz_256.Dictionary
   is
1543
1544
1545
1546
1547
1548
1549


1550
1551
1552
1553
1554
1555
1556
      R.Add_Option ("no-stats",      'S', No_Argument,       No_Stat_Output);
      R.Add_Option ("text-list",     't', No_Argument,       Text_List_Input);
      R.Add_Option ("fast-text-list", 'T', No_Argument,       Fast_Text_Input);
      R.Add_Option ("max-word-len",  'W', Required_Argument, Max_Word_Size);
      R.Add_Option ("s-expr",        'x', No_Argument,       Sx_Output);
      R.Add_Option ("no-s-expr",     'X', No_Argument,       No_Sx_Output);
      R.Add_Option ("force-word",         Required_Argument, Force_Word);


      R.Add_Option ("no-vlen-verbatim",   No_Argument,       No_Vlen_Verbatim);
      R.Add_Option ("score-method",       Required_Argument, Score_Method);
      R.Add_Option ("vlen-verbatim",      No_Argument,       Vlen_Verbatim);

      return R;
   end Getopt_Config;








>
>







1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
      R.Add_Option ("no-stats",      'S', No_Argument,       No_Stat_Output);
      R.Add_Option ("text-list",     't', No_Argument,       Text_List_Input);
      R.Add_Option ("fast-text-list", 'T', No_Argument,       Fast_Text_Input);
      R.Add_Option ("max-word-len",  'W', Required_Argument, Max_Word_Size);
      R.Add_Option ("s-expr",        'x', No_Argument,       Sx_Output);
      R.Add_Option ("no-s-expr",     'X', No_Argument,       No_Sx_Output);
      R.Add_Option ("force-word",         Required_Argument, Force_Word);
      R.Add_Option ("max-dict-size",      Required_Argument, Max_Dict_Size);
      R.Add_Option ("min-dict-size",      Required_Argument, Min_Dict_Size);
      R.Add_Option ("no-vlen-verbatim",   No_Argument,       No_Vlen_Verbatim);
      R.Add_Option ("score-method",       Required_Argument, Score_Method);
      R.Add_Option ("vlen-verbatim",      No_Argument,       Vlen_Verbatim);

      return R;
   end Getopt_Config;

1826
1827
1828
1829
1830
1831
1832










1833
1834
1835
1836
1837
1838
1839
            when Options.Force_Word =>
               Put_Line (Output, " <word>");
               Put_Line (Output, Indent & Indent
                 & "Force <word> into the dictionary,"
                 & " replacing the worst entry");
               Put_Line (Output, Indent & Indent
                 & "Can be specified multiple times to force many words.");










         end case;
      end loop;
   end Print_Help;


   Opt_Config : constant Getopt.Configuration := Getopt_Config;
   Handler : Callback;







>
>
>
>
>
>
>
>
>
>







1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
            when Options.Force_Word =>
               Put_Line (Output, " <word>");
               Put_Line (Output, Indent & Indent
                 & "Force <word> into the dictionary,"
                 & " replacing the worst entry");
               Put_Line (Output, Indent & Indent
                 & "Can be specified multiple times to force many words.");

            when Options.Max_Dict_Size =>
               Put_Line (Output, " <count>");
               Put_Line (Output, Indent & Indent
                 & "Maximum number of words in the dictionary to build");

            when Options.Min_Dict_Size =>
               Put_Line (Output, " <count>");
               Put_Line (Output, Indent & Indent
                 & "Minimum number of words in the dictionary to build");
         end case;
      end loop;
   end Print_Help;


   Opt_Config : constant Getopt.Configuration := Getopt_Config;
   Handler : Callback;