Natools

Check-in [637ebd90fa]
Login
Overview
Comment:tools/smaz: new command-line option to filter substrings by count
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: 637ebd90fa8c57ea1455e889a2478c4b9758007f
User & Date: nat on 2016-10-21 19:32:41
Other Links: manifest | tags
Context
2016-10-22
19:21
tools/smaz: replace "word list" with clearer "[sample] text list" check-in: db2278efbb user: nat tags: trunk
2016-10-21
19:32
tools/smaz: new command-line option to filter substrings by count check-in: 637ebd90fa user: nat tags: trunk
2016-10-20
20:15
smaz-tools: new primitive to filter substrings without enough counts check-in: f9d16b99f9 user: nat tags: trunk
Changes

Modified tools/smaz.adb from [ef8c32c6cd] to [71cfa2a6e9].

53
54
55
56
57
58
59

60
61
62
63
64
65
66
   package Options is
      type Id is
        (Output_Ada_Dict,
         Dictionary_Input,
         Decode,
         Encode,
         Evaluate,

         Output_Hash,
         Job_Count,
         Help,
         Sx_Dict_Output,
         Min_Sub_Size,
         Max_Sub_Size,
         Stat_Output,







>







53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
   package Options is
      type Id is
        (Output_Ada_Dict,
         Dictionary_Input,
         Decode,
         Encode,
         Evaluate,
         Filter_Threshold,
         Output_Hash,
         Job_Count,
         Help,
         Sx_Dict_Output,
         Min_Sub_Size,
         Max_Sub_Size,
         Stat_Output,
79
80
81
82
83
84
85

86
87
88
89
90
91
92
      Stat_Output : Boolean := False;
      Sx_Output : Boolean := False;
      Sx_Dict_Output : Boolean := False;
      Min_Sub_Size : Positive := 1;
      Max_Sub_Size : Positive := 3;
      Max_Word_Size : Positive := 10;
      Job_Count : Natural := 0;

      Action : Actions.Enum := Actions.Nothing;
      Ada_Dictionary : Ada.Strings.Unbounded.Unbounded_String;
      Hash_Package : Ada.Strings.Unbounded.Unbounded_String;
      Dict_Source : Dict_Sources.Enum := Dict_Sources.S_Expression;
   end record;

   overriding procedure Option







>







80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
      Stat_Output : Boolean := False;
      Sx_Output : Boolean := False;
      Sx_Dict_Output : Boolean := False;
      Min_Sub_Size : Positive := 1;
      Max_Sub_Size : Positive := 3;
      Max_Word_Size : Positive := 10;
      Job_Count : Natural := 0;
      Filter_Threshold : Natools.Smaz.Tools.String_Count := 0;
      Action : Actions.Enum := Actions.Nothing;
      Ada_Dictionary : Ada.Strings.Unbounded.Unbounded_String;
      Hash_Package : Ada.Strings.Unbounded.Unbounded_String;
      Dict_Source : Dict_Sources.Enum := Dict_Sources.S_Expression;
   end record;

   overriding procedure Option
200
201
202
203
204
205
206




207
208
209
210
211
212
213
214
215
216
217
218
219
220

221
222
223
224
225
226
227
            Handler.Max_Sub_Size := Positive'Value (Argument);

         when Options.Max_Word_Size =>
            Handler.Max_Word_Size := Positive'Value (Argument);

         when Options.Job_Count =>
            Handler.Job_Count := Natural'Value (Argument);




      end case;
   end Option;


   function Getopt_Config return Getopt.Configuration is
      use Getopt;
      use Options;
      R : Getopt.Configuration;
   begin
      R.Add_Option ("ada-dict",      'A', Optional_Argument, Output_Ada_Dict);
      R.Add_Option ("decode",        'd', No_Argument,       Decode);
      R.Add_Option ("dict",          'D', No_Argument,       Dictionary_Input);
      R.Add_Option ("encode",        'e', No_Argument,       Encode);
      R.Add_Option ("evaluate",      'E', No_Argument,       Evaluate);

      R.Add_Option ("help",          'h', No_Argument,       Help);
      R.Add_Option ("hash-pkg",      'H', Required_Argument, Output_Hash);
      R.Add_Option ("jobs",          'j', Required_Argument, Job_Count);
      R.Add_Option ("sx-dict",       'L', No_Argument,       Sx_Dict_Output);
      R.Add_Option ("min-substring", 'm', Required_Argument, Min_Sub_Size);
      R.Add_Option ("max-substring", 'M', Required_Argument, Max_Sub_Size);
      R.Add_Option ("stats",         's', No_Argument,       Stat_Output);







>
>
>
>














>







202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
            Handler.Max_Sub_Size := Positive'Value (Argument);

         when Options.Max_Word_Size =>
            Handler.Max_Word_Size := Positive'Value (Argument);

         when Options.Job_Count =>
            Handler.Job_Count := Natural'Value (Argument);

         when Options.Filter_Threshold =>
            Handler.Filter_Threshold
              := Natools.Smaz.Tools.String_Count'Value (Argument);
      end case;
   end Option;


   function Getopt_Config return Getopt.Configuration is
      use Getopt;
      use Options;
      R : Getopt.Configuration;
   begin
      R.Add_Option ("ada-dict",      'A', Optional_Argument, Output_Ada_Dict);
      R.Add_Option ("decode",        'd', No_Argument,       Decode);
      R.Add_Option ("dict",          'D', No_Argument,       Dictionary_Input);
      R.Add_Option ("encode",        'e', No_Argument,       Encode);
      R.Add_Option ("evaluate",      'E', No_Argument,       Evaluate);
      R.Add_Option ("filter",        'F', Required_Argument, Filter_Threshold);
      R.Add_Option ("help",          'h', No_Argument,       Help);
      R.Add_Option ("hash-pkg",      'H', Required_Argument, Output_Hash);
      R.Add_Option ("jobs",          'j', Required_Argument, Job_Count);
      R.Add_Option ("sx-dict",       'L', No_Argument,       Sx_Dict_Output);
      R.Add_Option ("min-substring", 'm', Required_Argument, Min_Sub_Size);
      R.Add_Option ("max-substring", 'M', Required_Argument, Max_Sub_Size);
      R.Add_Option ("stats",         's', No_Argument,       Stat_Output);
467
468
469
470
471
472
473







474
475
476
477
478
479
480
481


482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500





501
502
503
504
505
506
507
               Put_Line (Output, Indent & Indent
                 & "Evaluate the dictionary on the input given corpus");

            when Options.Job_Count =>
               New_Line (Output);
               Put_Line (Output, Indent & Indent
                 & "Number of parallel jobs in long calculations");







         end case;
      end loop;
   end Print_Help;

   function To_Dictionary
     (Handler : in Callback'Class;
      Input : in Natools.Smaz.Tools.String_Lists.List)
     return Natools.Smaz.Dictionary is


   begin
      case Handler.Dict_Source is
         when Dict_Sources.S_Expression =>
            return Natools.Smaz.Tools.To_Dictionary (Input, True);

         when Dict_Sources.Word_List =>
            declare
               Counter : Natools.Smaz.Tools.Word_Counter;
            begin
               for S of Input loop
                  Natools.Smaz.Tools.Add_Substrings
                    (Counter, S, Handler.Min_Sub_Size, Handler.Max_Sub_Size);

                  if Handler.Max_Word_Size > Handler.Max_Sub_Size then
                     Natools.Smaz.Tools.Add_Words
                       (Counter, S,
                        Handler.Max_Sub_Size + 1, Handler.Max_Word_Size);
                  end if;
               end loop;






               return Natools.Smaz.Tools.To_Dictionary
                 (Natools.Smaz.Tools.Simple_Dictionary (Counter, 254),
                  True);
            end;
      end case;
   end To_Dictionary;







>
>
>
>
>
>
>







|
>
>



















>
>
>
>
>







474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
               Put_Line (Output, Indent & Indent
                 & "Evaluate the dictionary on the input given corpus");

            when Options.Job_Count =>
               New_Line (Output);
               Put_Line (Output, Indent & Indent
                 & "Number of parallel jobs in long calculations");

            when Options.Filter_Threshold =>
               Put_Line (Output, " <threshold>");
               Put_Line (Output, Indent & Indent
                 & "Before building a dictionary from substrings, remove");
               Put_Line (Output, Indent & Indent
                 & "substrings whose count is below the threshold.");
         end case;
      end loop;
   end Print_Help;

   function To_Dictionary
     (Handler : in Callback'Class;
      Input : in Natools.Smaz.Tools.String_Lists.List)
     return Natools.Smaz.Dictionary
   is
      use type Natools.Smaz.Tools.String_Count;
   begin
      case Handler.Dict_Source is
         when Dict_Sources.S_Expression =>
            return Natools.Smaz.Tools.To_Dictionary (Input, True);

         when Dict_Sources.Word_List =>
            declare
               Counter : Natools.Smaz.Tools.Word_Counter;
            begin
               for S of Input loop
                  Natools.Smaz.Tools.Add_Substrings
                    (Counter, S, Handler.Min_Sub_Size, Handler.Max_Sub_Size);

                  if Handler.Max_Word_Size > Handler.Max_Sub_Size then
                     Natools.Smaz.Tools.Add_Words
                       (Counter, S,
                        Handler.Max_Sub_Size + 1, Handler.Max_Word_Size);
                  end if;
               end loop;

               if Handler.Filter_Threshold > 0 then
                  Natools.Smaz.Tools.Filter_By_Count
                    (Counter, Handler.Filter_Threshold);
               end if;

               return Natools.Smaz.Tools.To_Dictionary
                 (Natools.Smaz.Tools.Simple_Dictionary (Counter, 254),
                  True);
            end;
      end case;
   end To_Dictionary;