Natools

Check-in [cbe3489d15]
Login
Overview
Comment:tools/smaz: add a command-line option for optimized dictionary build
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: cbe3489d15af34f04e7332b6b4f8001dd5bb2319
User & Date: nat on 2016-10-27 21:58:33
Other Links: manifest | tags
Context
2016-10-28
20:56
tools/smaz: refactor scores out of the evaluation block check-in: fbe80ac184 user: nat tags: trunk
2016-10-27
21:58
tools/smaz: add a command-line option for optimized dictionary build check-in: cbe3489d15 user: nat tags: trunk
2016-10-26
20:58
tools/smaz: refactor dictionary evaluation in a standalone subprogram check-in: d418194c20 user: nat tags: trunk
Changes

Modified tools/smaz.adb from [ce2e2cd745] to [ac7f848c5d].

16
17
18
19
20
21
22

23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38



39
40
41
42
43
44
45
46
47
48
49
50

51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69

70
71
72
73
74
75
76

------------------------------------------------------------------------------
-- Command Line Interface for primitives in Natools.Smaz.Tools.             --
------------------------------------------------------------------------------

with Ada.Characters.Latin_1;
with Ada.Command_Line;

with Ada.Streams;
with Ada.Strings.Fixed;
with Ada.Strings.Unbounded;
with Ada.Text_IO.Text_Streams;
with Natools.Getopt_Long;
with Natools.Parallelism;
with Natools.S_Expressions.Parsers;
with Natools.S_Expressions.Printers;
with Natools.Smaz.Tools;
with Natools.Smaz.Tools.GNAT;
with Natools.String_Escapes;

procedure Smaz is
   --  Local shorthand for Natools.S_Expressions.To_Atom: takes a String and
   --  returns an Ada.Streams.Stream_Element_Array.
   function To_SEA (S : String) return Ada.Streams.Stream_Element_Array
     renames Natools.S_Expressions.To_Atom;




   --  Namespace holding the enumeration of actions known to this tool.
   --  NOTE(review): presumably the main operation selected through the
   --  command-line options of the same names; confirm against the option
   --  callback.
   package Actions is
      type Enum is
        (Nothing,
         Decode,
         Encode,
         Evaluate);
   end Actions;

   --  Namespace holding the enumeration of ways to obtain the dictionary.
   --  The option callback stores one of these in Handler.Dict_Source and
   --  To_Dictionary dispatches on it.
   package Dict_Sources is
      type Enum is
        (S_Expression,   --  selected by Options.Dictionary_Input
         Text_List);     --  selected by Options.Text_List_Input

   end Dict_Sources;

   --  Namespace holding the identifiers of the command-line options.
   --  Id is the actual parameter of the Natools.Getopt_Long instantiation
   --  below, and each value is bound to its flags in Getopt_Config.
   --  NOTE(review): the order of the literals may be relied upon by code
   --  iterating over Id (e.g. the help output); confirm before reordering.
   package Options is
      type Id is
        (Output_Ada_Dict,
         Dictionary_Input,
         Decode,
         Encode,
         Evaluate,
         Filter_Threshold,
         Output_Hash,
         Job_Count,          --  --jobs, -j (requires an argument)
         Help,
         Sx_Dict_Output,     --  --sx-dict, -L
         Min_Sub_Size,       --  --min-substring, -m (requires an argument)
         Max_Sub_Size,       --  --max-substring, -M (requires an argument)
         Stat_Output,        --  --stats, -s
         No_Stat_Output,     --  --no-stats, -S
         Text_List_Input,    --  --text-list, -t

         Max_Word_Size,      --  --max-word-len, -W (requires an argument)
         Sx_Output,          --  --s-expr, -x
         No_Sx_Output);      --  --no-s-expr, -X
   end Options;

   package Getopt is new Natools.Getopt_Long (Options.Id);








>
















>
>
>











|
>



















>







16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82

------------------------------------------------------------------------------
-- Command Line Interface for primitives in Natools.Smaz.Tools.             --
------------------------------------------------------------------------------

with Ada.Characters.Latin_1;
with Ada.Command_Line;
with Ada.Containers.Indefinite_Holders;
with Ada.Streams;
with Ada.Strings.Fixed;
with Ada.Strings.Unbounded;
with Ada.Text_IO.Text_Streams;
with Natools.Getopt_Long;
with Natools.Parallelism;
with Natools.S_Expressions.Parsers;
with Natools.S_Expressions.Printers;
with Natools.Smaz.Tools;
with Natools.Smaz.Tools.GNAT;
with Natools.String_Escapes;

procedure Smaz is
   --  Local shorthand for Natools.S_Expressions.To_Atom: takes a String and
   --  returns an Ada.Streams.Stream_Element_Array.
   function To_SEA (S : String) return Ada.Streams.Stream_Element_Array
     renames Natools.S_Expressions.To_Atom;

   --  Holder for a single Natools.Smaz.Dictionary value, compared with
   --  Natools.Smaz."=".  Optimize_Dictionary uses it to carry the current
   --  dictionary across calls to Optimization_Round.
   --  NOTE(review): presumably required because Dictionary is an indefinite
   --  type that cannot be declared as a plain variable; confirm.
   package Holders is new Ada.Containers.Indefinite_Holders
     (Natools.Smaz.Dictionary, Natools.Smaz."=");

   --  Namespace holding the enumeration of actions known to this tool.
   --  NOTE(review): presumably the main operation selected through the
   --  command-line options of the same names; confirm against the option
   --  callback.
   package Actions is
      type Enum is
        (Nothing,
         Decode,
         Encode,
         Evaluate);
   end Actions;

   --  Namespace holding the enumeration of ways to obtain the dictionary.
   --  The option callback stores one of these in Handler.Dict_Source and
   --  To_Dictionary dispatches on it.
   package Dict_Sources is
      type Enum is
        (S_Expression,             --  selected by Options.Dictionary_Input
         Text_List,                --  selected by Options.Text_List_Input;
                                   --  result goes through Optimize_Dictionary
         Unoptimized_Text_List);   --  selected by Options.Fast_Text_Input;
                                   --  same counting, no optimization pass
   end Dict_Sources;

   --  Namespace holding the identifiers of the command-line options.
   --  Id is the actual parameter of the Natools.Getopt_Long instantiation
   --  below, and each value is bound to its flags in Getopt_Config.
   --  NOTE(review): the order of the literals may be relied upon by code
   --  iterating over Id (e.g. the help output); confirm before reordering.
   package Options is
      type Id is
        (Output_Ada_Dict,
         Dictionary_Input,
         Decode,
         Encode,
         Evaluate,
         Filter_Threshold,
         Output_Hash,
         Job_Count,          --  --jobs, -j (requires an argument)
         Help,
         Sx_Dict_Output,     --  --sx-dict, -L
         Min_Sub_Size,       --  --min-substring, -m (requires an argument)
         Max_Sub_Size,       --  --max-substring, -M (requires an argument)
         Stat_Output,        --  --stats, -s
         No_Stat_Output,     --  --no-stats, -S
         Text_List_Input,    --  --text-list, -t
         Fast_Text_Input,    --  --fast-text-list, -T
         Max_Word_Size,      --  --max-word-len, -W (requires an argument)
         Sx_Output,          --  --s-expr, -x
         No_Sx_Output);      --  --no-s-expr, -X
   end Options;

   package Getopt is new Natools.Getopt_Long (Options.Id);

110
111
112
113
114
115
116




















117
118
119
120
121
122
123
      Counts : out Natools.Smaz.Tools.Dictionary_Counts);
      --  Dispatch to parallel or non-parallel version of Evaluate_Dictionary
      --  depending on Job_Count.

   function Getopt_Config return Getopt.Configuration;
      --  Build the configuration object





















   procedure Parallel_Evaluate_Dictionary
     (Job_Count : in Positive;
      Dict : in Natools.Smaz.Dictionary;
      Corpus : in Natools.Smaz.Tools.String_Lists.List;
      Compressed_Size : out Ada.Streams.Stream_Element_Count;
      Counts : out Natools.Smaz.Tools.Dictionary_Counts);
      --  Return the same results as Natools.Smaz.Tools.Evaluate_Dictionary,







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
      Counts : out Natools.Smaz.Tools.Dictionary_Counts);
      --  Dispatch to parallel or non-parallel version of Evaluate_Dictionary
      --  depending on Job_Count.

   function Getopt_Config return Getopt.Configuration;
      --  Build the configuration object

   procedure Optimization_Round
     (Dict : in out Holders.Holder;
      Score : in out Ada.Streams.Stream_Element_Count;
      Counts : in out Natools.Smaz.Tools.Dictionary_Counts;
      Pending_Words : in out Natools.Smaz.Tools.String_Lists.List;
      Input_Texts : in Natools.Smaz.Tools.String_Lists.List;
      Job_Count : in Natural;
      Updated : out Boolean);
      --  Try to improve on Dict by replacing a single entry from it with
      --  one of the substrings in Pending_Words.
      --  Optimize_Dictionary calls this procedure repeatedly, and stops
      --  as soon as Updated comes back False.

   function Optimize_Dictionary
     (Base : in Natools.Smaz.Dictionary;
      Pending_Words : in Natools.Smaz.Tools.String_Lists.List;
      Input_Texts : in Natools.Smaz.Tools.String_Lists.List;
      Job_Count : in Natural)
     return Natools.Smaz.Dictionary;
      --  Optimize the dictionary on Input_Texts, starting with Base and
      --  adding substrings from Pending_Words.
      --  Job_Count is forwarded to Evaluate_Dictionary for the initial
      --  evaluation of Base and to every optimization round.

   procedure Parallel_Evaluate_Dictionary
     (Job_Count : in Positive;
      Dict : in Natools.Smaz.Dictionary;
      Corpus : in Natools.Smaz.Tools.String_Lists.List;
      Compressed_Size : out Ada.Streams.Stream_Element_Count;
      Counts : out Natools.Smaz.Tools.Dictionary_Counts);
      --  Return the same results as Natools.Smaz.Tools.Evaluate_Dictionary,
196
197
198
199
200
201
202



203
204
205
206
207
208
209

         when Options.Dictionary_Input =>
            Handler.Dict_Source := Dict_Sources.S_Expression;

         when Options.Text_List_Input =>
            Handler.Dict_Source := Dict_Sources.Text_List;




         when Options.Sx_Dict_Output =>
            Handler.Need_Dictionary := True;
            Handler.Sx_Dict_Output := True;

         when Options.Min_Sub_Size =>
            Handler.Min_Sub_Size := Positive'Value (Argument);








>
>
>







222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238

         when Options.Dictionary_Input =>
            Handler.Dict_Source := Dict_Sources.S_Expression;

         when Options.Text_List_Input =>
            Handler.Dict_Source := Dict_Sources.Text_List;

         when Options.Fast_Text_Input =>
            Handler.Dict_Source := Dict_Sources.Unoptimized_Text_List;

         when Options.Sx_Dict_Output =>
            Handler.Need_Dictionary := True;
            Handler.Sx_Dict_Output := True;

         when Options.Min_Sub_Size =>
            Handler.Min_Sub_Size := Positive'Value (Argument);

275
276
277
278
279
280
281

282
283
284
285
286
287
288


















































289
290
291
292
293
294
295
      R.Add_Option ("jobs",          'j', Required_Argument, Job_Count);
      R.Add_Option ("sx-dict",       'L', No_Argument,       Sx_Dict_Output);
      R.Add_Option ("min-substring", 'm', Required_Argument, Min_Sub_Size);
      R.Add_Option ("max-substring", 'M', Required_Argument, Max_Sub_Size);
      R.Add_Option ("stats",         's', No_Argument,       Stat_Output);
      R.Add_Option ("no-stats",      'S', No_Argument,       No_Stat_Output);
      R.Add_Option ("text-list",     't', No_Argument,       Text_List_Input);

      R.Add_Option ("max-word-len",  'W', Required_Argument, Max_Word_Size);
      R.Add_Option ("s-expr",        'x', No_Argument,       Sx_Output);
      R.Add_Option ("no-s-expr",     'X', No_Argument,       No_Sx_Output);

      return R;
   end Getopt_Config;




















































   procedure Parallel_Evaluate_Dictionary
     (Job_Count : in Positive;
      Dict : in Natools.Smaz.Dictionary;
      Corpus : in Natools.Smaz.Tools.String_Lists.List;
      Compressed_Size : out Ada.Streams.Stream_Element_Count;
      Counts : out Natools.Smaz.Tools.Dictionary_Counts)







>







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
      R.Add_Option ("jobs",          'j', Required_Argument, Job_Count);
      R.Add_Option ("sx-dict",       'L', No_Argument,       Sx_Dict_Output);
      R.Add_Option ("min-substring", 'm', Required_Argument, Min_Sub_Size);
      R.Add_Option ("max-substring", 'M', Required_Argument, Max_Sub_Size);
      R.Add_Option ("stats",         's', No_Argument,       Stat_Output);
      R.Add_Option ("no-stats",      'S', No_Argument,       No_Stat_Output);
      R.Add_Option ("text-list",     't', No_Argument,       Text_List_Input);
      R.Add_Option ("fast-text-list", 'T', No_Argument,       Fast_Text_Input);
      R.Add_Option ("max-word-len",  'W', Required_Argument, Max_Word_Size);
      R.Add_Option ("s-expr",        'x', No_Argument,       Sx_Output);
      R.Add_Option ("no-s-expr",     'X', No_Argument,       No_Sx_Output);

      return R;
   end Getopt_Config;


   procedure Optimization_Round
     (Dict : in out Holders.Holder;
      Score : in out Ada.Streams.Stream_Element_Count;
      Counts : in out Natools.Smaz.Tools.Dictionary_Counts;
      Pending_Words : in out Natools.Smaz.Tools.String_Lists.List;
      Input_Texts : in Natools.Smaz.Tools.String_Lists.List;
      Job_Count : in Natural;
      Updated : out Boolean)
   is
      --  Placeholder body: no parameter is read or modified yet, hence
      --  the pragma below to keep the compiler quiet about them.
      pragma Unreferenced
        (Dict, Score, Counts, Pending_Words, Input_Texts, Job_Count);
   begin
      --  Always report that nothing changed, so that the caller's
      --  optimization loop stops after this single round.
      Updated := False;
   end Optimization_Round;


   function Optimize_Dictionary
     (Base : in Natools.Smaz.Dictionary;
      Pending_Words : in Natools.Smaz.Tools.String_Lists.List;
      Input_Texts : in Natools.Smaz.Tools.String_Lists.List;
      Job_Count : in Natural)
     return Natools.Smaz.Dictionary
   is
      --  Working state threaded through the optimization rounds
      Current : Holders.Holder := Holders.To_Holder (Base);
      Remaining : Natools.Smaz.Tools.String_Lists.List := Pending_Words;
      Best_Score : Ada.Streams.Stream_Element_Count;
      Usage : Natools.Smaz.Tools.Dictionary_Counts;
      Improved : Boolean;
   begin
      --  Evaluate the starting dictionary once, before any round
      Evaluate_Dictionary
        (Job_Count, Base, Input_Texts, Best_Score, Usage);

      --  Run at least one round, and go on for as long as rounds report
      --  an update.
      loop
         Optimization_Round
           (Dict => Current,
            Score => Best_Score,
            Counts => Usage,
            Pending_Words => Remaining,
            Input_Texts => Input_Texts,
            Job_Count => Job_Count,
            Updated => Improved);

         exit when not Improved;
      end loop;

      return Current.Element;
   end Optimize_Dictionary;


   procedure Parallel_Evaluate_Dictionary
     (Job_Count : in Positive;
      Dict : in Natools.Smaz.Dictionary;
      Corpus : in Natools.Smaz.Tools.String_Lists.List;
      Compressed_Size : out Ada.Streams.Stream_Element_Count;
      Counts : out Natools.Smaz.Tools.Dictionary_Counts)
491
492
493
494
495
496
497






498
499
500
501
502
503
504

            when Options.Text_List_Input =>
               New_Line (Output);
               Put_Line (Output, Indent & Indent
                 & "Compute dictionary from sample texts"
                 & " in input S-expression");







            when Options.Sx_Dict_Output =>
               New_Line (Output);
               Put_Line (Output, Indent & Indent
                 & "Output the dictionary as a S-expression");

            when Options.Min_Sub_Size =>
               New_Line (Output);







>
>
>
>
>
>







571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590

            when Options.Text_List_Input =>
               New_Line (Output);
               Put_Line (Output, Indent & Indent
                 & "Compute dictionary from sample texts"
                 & " in input S-expression");

            when Options.Fast_Text_Input =>
               New_Line (Output);
               Put_Line (Output, Indent & Indent
                 & "Compute dictionary from sample texts"
                 & " in input S-expression, without optimization");

            when Options.Sx_Dict_Output =>
               New_Line (Output);
               Put_Line (Output, Indent & Indent
                 & "Output the dictionary as a S-expression");

            when Options.Min_Sub_Size =>
               New_Line (Output);
530
531
532
533
534
535
536

537
538
539
540
541
542
543

544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568














569
570
571

572
573
574
575
576
577
578
               Put_Line (Output, Indent & Indent
                 & "Before building a dictionary from substrings, remove");
               Put_Line (Output, Indent & Indent
                 & "substrings whose count is below the threshold.");
         end case;
      end loop;
   end Print_Help;


   --  Build the dictionary from Input, in the way selected by
   --  Handler.Dict_Source.
   function To_Dictionary
     (Handler : in Callback'Class;
      Input : in Natools.Smaz.Tools.String_Lists.List)
     return Natools.Smaz.Dictionary
   is
      --  Makes ">" visible for the Filter_Threshold comparison below
      use type Natools.Smaz.Tools.String_Count;

   begin
      case Handler.Dict_Source is
         when Dict_Sources.S_Expression =>
            --  Input is handed directly to the library conversion
            return Natools.Smaz.Tools.To_Dictionary (Input, True);

         when Dict_Sources.Text_List =>
            --  Input is a list of sample texts: count their substrings
            --  and build the dictionary from the counts.
            declare
               Counter : Natools.Smaz.Tools.Word_Counter;
            begin
               for S of Input loop
                  Natools.Smaz.Tools.Add_Substrings
                    (Counter, S, Handler.Min_Sub_Size, Handler.Max_Sub_Size);

                  --  Sizes above Max_Sub_Size are only covered through
                  --  Add_Words, up to Max_Word_Size.
                  if Handler.Max_Word_Size > Handler.Max_Sub_Size then
                     Natools.Smaz.Tools.Add_Words
                       (Counter, S,
                        Handler.Max_Sub_Size + 1, Handler.Max_Word_Size);
                  end if;
               end loop;

               --  A threshold of zero leaves the counter unfiltered
               if Handler.Filter_Threshold > 0 then
                  Natools.Smaz.Tools.Filter_By_Count
                    (Counter, Handler.Filter_Threshold);
               end if;















               --  NOTE(review): 254 is presumably the number of entries
               --  kept in the dictionary; confirm against
               --  Natools.Smaz.Tools.Simple_Dictionary.
               return Natools.Smaz.Tools.To_Dictionary
                 (Natools.Smaz.Tools.Simple_Dictionary (Counter, 254),
                  True);

            end;
      end case;
   end To_Dictionary;

   Opt_Config : constant Getopt.Configuration := Getopt_Config;
   Handler : Callback;
   Input_List, Input_Data : Natools.Smaz.Tools.String_Lists.List;







>







>





|



















>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
|
|
>







616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
               Put_Line (Output, Indent & Indent
                 & "Before building a dictionary from substrings, remove");
               Put_Line (Output, Indent & Indent
                 & "substrings whose count is below the threshold.");
         end case;
      end loop;
   end Print_Help;


   --  Build the dictionary from Input, in the way selected by
   --  Handler.Dict_Source.
   function To_Dictionary
     (Handler : in Callback'Class;
      Input : in Natools.Smaz.Tools.String_Lists.List)
     return Natools.Smaz.Dictionary
   is
      use type Natools.Smaz.Tools.String_Count;
      use type Dict_Sources.Enum;
   begin
      --  A ready-made dictionary only needs to be converted
      if Handler.Dict_Source = Dict_Sources.S_Expression then
         return Natools.Smaz.Tools.To_Dictionary (Input, True);
      end if;

      --  Otherwise Input is a list of sample texts (Text_List or
      --  Unoptimized_Text_List): count substrings and words first.
      declare
         Tally : Natools.Smaz.Tools.Word_Counter;
         With_Words : constant Boolean
           := Handler.Max_Word_Size > Handler.Max_Sub_Size;
      begin
         for Sample of Input loop
            Natools.Smaz.Tools.Add_Substrings
              (Tally, Sample, Handler.Min_Sub_Size, Handler.Max_Sub_Size);

            if With_Words then
               Natools.Smaz.Tools.Add_Words
                 (Tally, Sample,
                  Handler.Max_Sub_Size + 1, Handler.Max_Word_Size);
            end if;
         end loop;

         if Handler.Filter_Threshold > 0 then
            Natools.Smaz.Tools.Filter_By_Count
              (Tally, Handler.Filter_Threshold);
         end if;

         if Handler.Dict_Source /= Dict_Sources.Text_List then
            --  Fast path: keep the simple dictionary as it is
            return Natools.Smaz.Tools.To_Dictionary
              (Natools.Smaz.Tools.Simple_Dictionary (Tally, 254),
               True);
         else
            --  Start from the simple dictionary and let the optimizer
            --  work with the words left out of it.
            declare
               Chosen, Leftover : Natools.Smaz.Tools.String_Lists.List;
            begin
               Natools.Smaz.Tools.Simple_Dictionary_And_Pending
                 (Tally, 254, Chosen, Leftover);

               return Optimize_Dictionary
                 (Natools.Smaz.Tools.To_Dictionary (Chosen, True),
                  Leftover,
                  Input,
                  Handler.Job_Count);
            end;
         end if;
      end;
   end To_Dictionary;

   Opt_Config : constant Getopt.Configuration := Getopt_Config;
   Handler : Callback;
   Input_List, Input_Data : Natools.Smaz.Tools.String_Lists.List;