Natools

Check-in [adcca90a65]
Login
Overview
Comment: tools/smaz: new command-line option to enable parallel dictionary eval
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: adcca90a65dc20606c6d5e9323065bbb069b38b3
User & Date: nat on 2016-10-12 17:50:33
Other Links: manifest | tags
Context
2016-10-13
21:47
parallelism: add a new framework with a task-local accumulator check-in: db5f3c6f57 user: nat tags: trunk
2016-10-12
17:50
tools/smaz: new command-line option to enable parallel dictionary eval check-in: adcca90a65 user: nat tags: trunk
2016-10-11
15:37
parallelism: new package providing framework for simple parallelizations check-in: a45910b245 user: nat tags: trunk
Changes

Modified tools/smaz.adb from [80ca13ccbf] to [b920b70a9d].

21
22
23
24
25
26
27

28
29
30
31
32
33
34
with Ada.Characters.Latin_1;
with Ada.Command_Line;
with Ada.Streams;
with Ada.Strings.Fixed;
with Ada.Strings.Unbounded;
with Ada.Text_IO.Text_Streams;
with Natools.Getopt_Long;

with Natools.S_Expressions.Parsers;
with Natools.S_Expressions.Printers;
with Natools.Smaz.Tools;
with Natools.Smaz.Tools.GNAT;
with Natools.String_Escapes;

procedure Smaz is







>







21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
with Ada.Characters.Latin_1;
with Ada.Command_Line;
with Ada.Streams;
with Ada.Strings.Fixed;
with Ada.Strings.Unbounded;
with Ada.Text_IO.Text_Streams;
with Natools.Getopt_Long;
with Natools.Parallelism;
with Natools.S_Expressions.Parsers;
with Natools.S_Expressions.Printers;
with Natools.Smaz.Tools;
with Natools.Smaz.Tools.GNAT;
with Natools.String_Escapes;

procedure Smaz is
53
54
55
56
57
58
59

60
61
62
63
64
65
66
      type Id is
        (Output_Ada_Dict,
         Dictionary_Input,
         Decode,
         Encode,
         Evaluate,
         Output_Hash,

         Help,
         Sx_Dict_Output,
         Min_Sub_Size,
         Max_Sub_Size,
         Stat_Output,
         No_Stat_Output,
         Word_List_Input,







>







54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
      type Id is
        (Output_Ada_Dict,
         Dictionary_Input,
         Decode,
         Encode,
         Evaluate,
         Output_Hash,
         Job_Count,
         Help,
         Sx_Dict_Output,
         Min_Sub_Size,
         Max_Sub_Size,
         Stat_Output,
         No_Stat_Output,
         Word_List_Input,
76
77
78
79
80
81
82

83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102









103
104
105
106
107
108
109
      Need_Dictionary : Boolean := False;
      Stat_Output : Boolean := False;
      Sx_Output : Boolean := False;
      Sx_Dict_Output : Boolean := False;
      Min_Sub_Size : Positive := 1;
      Max_Sub_Size : Positive := 3;
      Max_Word_Size : Positive := 10;

      Action : Actions.Enum := Actions.Nothing;
      Ada_Dictionary : Ada.Strings.Unbounded.Unbounded_String;
      Hash_Package : Ada.Strings.Unbounded.Unbounded_String;
      Dict_Source : Dict_Sources.Enum := Dict_Sources.S_Expression;
   end record;

   overriding procedure Option
     (Handler  : in out Callback;
      Id       : in Options.Id;
      Argument : in String);

   overriding procedure Argument
     (Handler  : in out Callback;
      Argument : in String)
     is null;


   function Getopt_Config return Getopt.Configuration;
      --  Build the configuration object










   procedure Print_Dictionary
     (Filename : in String;
      Dictionary : in Natools.Smaz.Dictionary;
      Hash_Package_Name : in String := "");
   procedure Print_Dictionary
     (Output : in Ada.Text_IO.File_Type;
      Dictionary : in Natools.Smaz.Dictionary;







>




















>
>
>
>
>
>
>
>
>







78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
      Need_Dictionary : Boolean := False;
      Stat_Output : Boolean := False;
      Sx_Output : Boolean := False;
      Sx_Dict_Output : Boolean := False;
      Min_Sub_Size : Positive := 1;
      Max_Sub_Size : Positive := 3;
      Max_Word_Size : Positive := 10;
      Job_Count : Natural := 0;
      Action : Actions.Enum := Actions.Nothing;
      Ada_Dictionary : Ada.Strings.Unbounded.Unbounded_String;
      Hash_Package : Ada.Strings.Unbounded.Unbounded_String;
      Dict_Source : Dict_Sources.Enum := Dict_Sources.S_Expression;
   end record;

   overriding procedure Option
     (Handler  : in out Callback;
      Id       : in Options.Id;
      Argument : in String);

   overriding procedure Argument
     (Handler  : in out Callback;
      Argument : in String)
     is null;


   function Getopt_Config return Getopt.Configuration;
      --  Build the configuration object

   procedure Parallel_Evaluate_Dictionary
     (Job_Count : in Positive;
      Dict : in Natools.Smaz.Dictionary;
      Corpus : in Natools.Smaz.Tools.String_Lists.List;
      Compressed_Size : out Ada.Streams.Stream_Element_Count;
      Counts : out Natools.Smaz.Tools.Dictionary_Counts);
      --  Return the same results as Natools.Smaz.Tools.Evaluate_Dictionary,
      --  but hopefully more quickly, using Job_Count tasks.

   procedure Print_Dictionary
     (Filename : in String;
      Dictionary : in Natools.Smaz.Dictionary;
      Hash_Package_Name : in String := "");
   procedure Print_Dictionary
     (Output : in Ada.Text_IO.File_Type;
      Dictionary : in Natools.Smaz.Dictionary;
185
186
187
188
189
190
191



192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207

208
209
210
211
212
213
214
215
216
217
218
219
220


















































































221
222
223
224
225
226
227
            Handler.Min_Sub_Size := Positive'Value (Argument);

         when Options.Max_Sub_Size =>
            Handler.Max_Sub_Size := Positive'Value (Argument);

         when Options.Max_Word_Size =>
            Handler.Max_Word_Size := Positive'Value (Argument);



      end case;
   end Option;


   function Getopt_Config return Getopt.Configuration is
      --  Build and return the command-line option table.
      --  Every Add_Option call below supplies, in order: the long option
      --  name, the single-character short flag, the argument requirement
      --  (No_Argument, Required_Argument or Optional_Argument) and the
      --  Options.Id value associated with that option.
      use Getopt;
      use Options;
      R : Getopt.Configuration;
   begin
      R.Add_Option ("ada-dict",      'A', Optional_Argument, Output_Ada_Dict);
      R.Add_Option ("decode",        'd', No_Argument,       Decode);
      R.Add_Option ("dict",          'D', No_Argument,       Dictionary_Input);
      R.Add_Option ("encode",        'e', No_Argument,       Encode);
      R.Add_Option ("evaluate",      'E', No_Argument,       Evaluate);
      R.Add_Option ("help",          'h', No_Argument,       Help);
      R.Add_Option ("hash-pkg",      'H', Required_Argument, Output_Hash);

      R.Add_Option ("sx-dict",       'L', No_Argument,       Sx_Dict_Output);
      R.Add_Option ("min-substring", 'm', Required_Argument, Min_Sub_Size);
      R.Add_Option ("max-substring", 'M', Required_Argument, Max_Sub_Size);
      R.Add_Option ("stats",         's', No_Argument,       Stat_Output);
      R.Add_Option ("no-stats",      'S', No_Argument,       No_Stat_Output);
      R.Add_Option ("word-list",     'w', No_Argument,       Word_List_Input);
      R.Add_Option ("max-word-len",  'W', Required_Argument, Max_Word_Size);
      R.Add_Option ("s-expr",        'x', No_Argument,       Sx_Output);
      R.Add_Option ("no-s-expr",     'X', No_Argument,       No_Sx_Output);

      return R;
   end Getopt_Config;




















































































   procedure Print_Dictionary
     (Filename : in String;
      Dictionary : in Natools.Smaz.Dictionary;
      Hash_Package_Name : in String := "") is
   begin
      if Filename = "-" then







>
>
>
















>













>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
            Handler.Min_Sub_Size := Positive'Value (Argument);

         when Options.Max_Sub_Size =>
            Handler.Max_Sub_Size := Positive'Value (Argument);

         when Options.Max_Word_Size =>
            Handler.Max_Word_Size := Positive'Value (Argument);

         when Options.Job_Count =>
            Handler.Job_Count := Natural'Value (Argument);
      end case;
   end Option;


   function Getopt_Config return Getopt.Configuration is
      --  Assemble the table of command-line options accepted by the tool.
      --  Each registration lists the long name, the short flag character,
      --  the argument requirement and the matching Options.Id value.
      --  The registration order is kept as-is on purpose.
      use Getopt;
      use Options;

      Cfg : Getopt.Configuration;
   begin
      --  Actions and dictionary sources/outputs
      Cfg.Add_Option ("ada-dict", 'A', Optional_Argument, Output_Ada_Dict);
      Cfg.Add_Option ("decode", 'd', No_Argument, Decode);
      Cfg.Add_Option ("dict", 'D', No_Argument, Dictionary_Input);
      Cfg.Add_Option ("encode", 'e', No_Argument, Encode);
      Cfg.Add_Option ("evaluate", 'E', No_Argument, Evaluate);
      Cfg.Add_Option ("help", 'h', No_Argument, Help);
      Cfg.Add_Option ("hash-pkg", 'H', Required_Argument, Output_Hash);

      --  Number of parallel jobs
      Cfg.Add_Option ("jobs", 'j', Required_Argument, Job_Count);

      --  Dictionary construction and output tuning
      Cfg.Add_Option ("sx-dict", 'L', No_Argument, Sx_Dict_Output);
      Cfg.Add_Option ("min-substring", 'm', Required_Argument, Min_Sub_Size);
      Cfg.Add_Option ("max-substring", 'M', Required_Argument, Max_Sub_Size);
      Cfg.Add_Option ("stats", 's', No_Argument, Stat_Output);
      Cfg.Add_Option ("no-stats", 'S', No_Argument, No_Stat_Output);
      Cfg.Add_Option ("word-list", 'w', No_Argument, Word_List_Input);
      Cfg.Add_Option ("max-word-len", 'W', Required_Argument, Max_Word_Size);
      Cfg.Add_Option ("s-expr", 'x', No_Argument, Sx_Output);
      Cfg.Add_Option ("no-s-expr", 'X', No_Argument, No_Sx_Output);

      return Cfg;
   end Getopt_Config;


   --  Evaluate Dict against every string of Corpus, one job per string,
   --  and return the summed compressed size and per-entry counts.
   --  Job_Count is forwarded unchanged to the parallel runner.
   procedure Parallel_Evaluate_Dictionary
     (Job_Count : in Positive;
      Dict : in Natools.Smaz.Dictionary;
      Corpus : in Natools.Smaz.Tools.String_Lists.List;
      Compressed_Size : out Ada.Streams.Stream_Element_Count;
      Counts : out Natools.Smaz.Tools.Dictionary_Counts)
   is
      package String_Lists renames Natools.Smaz.Tools.String_Lists;

      --  Per-job data: the corpus element to process and the totals
      --  computed for that single element.
      type State is record
         Position : String_Lists.Cursor;
         Compressed_Size : Ada.Streams.Stream_Element_Count;
         Counts : Natools.Smaz.Tools.Dictionary_Counts;
      end record;

      --  The "global" state shared between jobs is just a cursor over
      --  Corpus, pointing at the next string that has not been handed out.

      procedure Initialize_Job
        (Global : in out String_Lists.Cursor;
         Job : out State);

      procedure Do_Job (Job : in out State);

      procedure Gather_Result
        (Global : in out String_Lists.Cursor;
         Job : in State);

      function Is_Finished (Global : in String_Lists.Cursor) return Boolean;


      --  Hand the string designated by Global to a new job with zeroed
      --  totals, then advance Global to the following string.
      procedure Initialize_Job
        (Global : in out String_Lists.Cursor;
         Job : out State) is
      begin
         Job := (Position => Global,
                 Compressed_Size => 0,
                 Counts => (others => 0));
         String_Lists.Next (Global);
      end Initialize_Job;


      --  Evaluate Dict on the single string assigned to Job.
      --  Only Dict (read-only here) and the job's own record are touched.
      --  NOTE(review): presumably Evaluate_Dictionary_Partial adds to the
      --  size and counts it is given -- confirm in Natools.Smaz.Tools.
      procedure Do_Job (Job : in out State) is
      begin
         Natools.Smaz.Tools.Evaluate_Dictionary_Partial
           (Dict,
            String_Lists.Element (Job.Position),
            Job.Compressed_Size,
            Job.Counts);
      end Do_Job;


      --  Fold the totals of a completed job into the out parameters of
      --  the enclosing procedure. The global cursor is not needed here.
      --  NOTE(review): this writes to variables shared by all jobs; it
      --  assumes the framework never runs Gather_Result (or
      --  Initialize_Job) concurrently -- TODO confirm in
      --  Natools.Parallelism.
      procedure Gather_Result
        (Global : in out String_Lists.Cursor;
         Job : in State)
      is
         pragma Unreferenced (Global);
         use type Ada.Streams.Stream_Element_Count;
         use type Natools.Smaz.Tools.String_Count;
      begin
         Compressed_Size := Compressed_Size + Job.Compressed_Size;

         for I in Counts'Range loop
            Counts (I) := Counts (I) + Job.Counts (I);
         end loop;
      end Gather_Result;


      --  There is nothing left to hand out once the cursor has moved past
      --  the last string (or when Corpus is empty).
      function Is_Finished (Global : in String_Lists.Cursor) return Boolean is
      begin
         return not String_Lists.Has_Element (Global);
      end Is_Finished;


      --  Only the two types are given as actuals.
      --  NOTE(review): Initialize_Job, Do_Job, Gather_Result and
      --  Is_Finished are presumably bound by name through default
      --  generic formals, so they must not be renamed -- confirm against
      --  the specification of Single_Accumulator_Run.
      procedure Parallel_Run is new Natools.Parallelism.Single_Accumulator_Run
        (String_Lists.Cursor, State);

      Cursor : String_Lists.Cursor := String_Lists.First (Corpus);
   begin
      --  The out parameters act as accumulators for Gather_Result, so they
      --  have to be zeroed before any job can complete.
      Compressed_Size := 0;
      Counts := (others => 0);
      Parallel_Run (Cursor, Job_Count);
   end Parallel_Evaluate_Dictionary;


   procedure Print_Dictionary
     (Filename : in String;
      Dictionary : in Natools.Smaz.Dictionary;
      Hash_Package_Name : in String := "") is
   begin
      if Filename = "-" then
356
357
358
359
360
361
362





363
364
365
366
367
368
369
               Put_Line (Output, Indent & Indent
                 & "Maximum word size when building a dictionary");

            when Options.Evaluate =>
               New_Line (Output);
               Put_Line (Output, Indent & Indent
                 & "Evaluate the dictionary on the input given corpus");





         end case;
      end loop;
   end Print_Help;

   function To_Dictionary
     (Handler : in Callback'Class;
      Input : in Natools.Smaz.Tools.String_Lists.List)







>
>
>
>
>







454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
               Put_Line (Output, Indent & Indent
                 & "Maximum word size when building a dictionary");

            when Options.Evaluate =>
               New_Line (Output);
               Put_Line (Output, Indent & Indent
                 & "Evaluate the dictionary on the input given corpus");

            when Options.Job_Count =>
               New_Line (Output);
               Put_Line (Output, Indent & Indent
                 & "Number of parallel jobs in long calculations");
         end case;
      end loop;
   end Print_Help;

   function To_Dictionary
     (Handler : in Callback'Class;
      Input : in Natools.Smaz.Tools.String_Lists.List)
567
568
569
570
571
572
573




574
575

576
577
578
579
580
581
582
            end if;

         when Actions.Evaluate =>
            declare
               Total_Size : Ada.Streams.Stream_Element_Count;
               Counts : Natools.Smaz.Tools.Dictionary_Counts;
            begin




               Natools.Smaz.Tools.Evaluate_Dictionary
                 (Dictionary, Input_Data, Total_Size, Counts);


               if Handler.Sx_Output then
                  Sx_Output.Open_List;
                  Sx_Output.Append_String (Ada.Strings.Fixed.Trim
                    (Ada.Streams.Stream_Element_Count'Image (Total_Size),
                     Ada.Strings.Both));








>
>
>
>
|
|
>







670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
            end if;

         when Actions.Evaluate =>
            declare
               Total_Size : Ada.Streams.Stream_Element_Count;
               Counts : Natools.Smaz.Tools.Dictionary_Counts;
            begin
               if Handler.Job_Count > 0 then
                  Parallel_Evaluate_Dictionary (Handler.Job_Count,
                     Dictionary, Input_Data, Total_Size, Counts);
               else
                  Natools.Smaz.Tools.Evaluate_Dictionary
                    (Dictionary, Input_Data, Total_Size, Counts);
               end if;

               if Handler.Sx_Output then
                  Sx_Output.Open_List;
                  Sx_Output.Append_String (Ada.Strings.Fixed.Trim
                    (Ada.Streams.Stream_Element_Count'Image (Total_Size),
                     Ada.Strings.Both));