Natools

Check-in [b2dbec1810]
Login
Overview
Comment: tools/smaz: refactor word count construction
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: b2dbec1810052675e57598e59cd3a5d0b125d1b0
User & Date: nat on 2017-05-13 20:47:16
Other Links: manifest | tags
Context
2017-05-14
20:16
tools/smaz: implement forced words for unoptimized dictionary generation check-in: bb325e8d12 user: nat tags: trunk
2017-05-13
20:47
tools/smaz: refactor word count construction check-in: b2dbec1810 user: nat tags: trunk
2017-05-12
22:26
tools/smaz: move Adjust_Dictionary call into To_Dictionary function check-in: 3b85da8290 user: nat tags: trunk
Changes

Modified tools/smaz.adb from [28df8aef20] to [d59b535a0b].

354
355
356
357
358
359
360






361
362
363
364
365
366
367
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373







+
+
+
+
+
+







         Code : in Dictionary_Entry)
        return Natools.S_Expressions.Atom;
         --  S-expression image of Code

      function Is_In_Dict (Dict : Dictionary; Word : String) return Boolean;
         --  Return whether Word is in Dict (inefficient).
         --  The result is True as soon as the search loop finds a match,
         --  and False once the loop completes without one.

      function Make_Word_Counter
        (Handler : in Callback'Class;
         Input : in String_Lists.List)
        return Word_Counter;
         --  Make a word counter from an input word list.
         --  Each string of Input is given to Add_Substrings with the bounds
         --  Handler.Min_Sub_Size and Handler.Max_Sub_Size, and, only when
         --  Handler.Max_Word_Size > Handler.Max_Sub_Size, to Add_Words with
         --  the bounds Handler.Max_Sub_Size + 1 and Handler.Max_Word_Size.
         --  When Handler.Filter_Threshold is positive, the resulting counter
         --  is then processed by Filter_By_Count with that threshold.

      procedure Optimization_Round
        (Dict : in out Holders.Holder;
         Score : in out Ada.Streams.Stream_Element_Count;
         Counts : in out Dictionary_Counts;
         Pending_Words : in out String_Lists.List;
         Input_Texts : in String_Lists.List;
         Job_Count : in Natural;
510
511
512
513
514
515
516




























517
518
519
520
521
522
523
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557







+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+







               return True;
            end if;
         end loop;

         return False;
      end Is_In_Dict;


      function Make_Word_Counter
        (Handler : in Callback'Class;
         Input : in String_Lists.List)
        return Word_Counter
      is
         --  Build and return a word counter fed with every string of Input,
         --  using the size bounds and filter threshold held by Handler.

         use type Natools.Smaz_Tools.String_Count;
      begin
         return Result : Word_Counter do
            for Text of Input loop
               --  Every input string goes through Add_Substrings with
               --  the substring size bounds
               Add_Substrings
                 (Result,
                  Text,
                  Handler.Min_Sub_Size,
                  Handler.Max_Sub_Size);

               --  Add_Words is only called when the word size range
               --  starting just above Max_Sub_Size is not empty
               if Handler.Max_Sub_Size < Handler.Max_Word_Size then
                  Add_Words
                    (Result,
                     Text,
                     Handler.Max_Sub_Size + 1,
                     Handler.Max_Word_Size);
               end if;
            end loop;

            --  A zero threshold leaves the counter unfiltered
            if Handler.Filter_Threshold > 0 then
               Filter_By_Count
                 (Result, String_Count (Handler.Filter_Threshold));
            end if;
         end return;
      end Make_Word_Counter;


      procedure Optimization_Round
        (Dict : in out Holders.Holder;
         Score : in out Ada.Streams.Stream_Element_Count;
         Counts : in out Dictionary_Counts;
         Pending_Words : in out String_Lists.List;
         Input_Texts : in String_Lists.List;
1038
1039
1040
1041
1042
1043
1044
1045

1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058

1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089









1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101















1102
1103
1104
1105
1106
1107
1108
1109
1110
1072
1073
1074
1075
1076
1077
1078

1079



1080
1081
1082
1083
1084
1085
1086
1087
1088

1089
1090






























1091
1092
1093
1094
1095
1096
1097
1098
1099
1100











1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115


1116
1117
1118
1119
1120
1121
1122







-
+
-
-
-









-
+

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+

-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-









      function To_Dictionary
        (Handler : in Callback'Class;
         Input : in String_Lists.List;
         Data_List : in String_Lists.List;
         Method : in Methods)
        return Dictionary is
         --  Build a dictionary from Input, in the way selected by
         --  Handler.Dict_Source:
         --   * S_Expression: Input is converted directly to a dictionary,
         --     which is then given to Adjust_Dictionary along with Data_List.
         --   * Text_List: a word counter is made from Input, an initial
         --     selection and a pending list are extracted from it, and the
         --     result goes through Optimize_Dictionary.
         --   * Unoptimized_Text_List: a word counter is made from Input and
         --     the output of Simple_Dictionary is converted as is.
         --  Data_List is only used in the S_Expression case.
      begin
         case Handler.Dict_Source is
            when Dict_Sources.S_Expression =>
               return Adjust_Dictionary
                 (Handler,
                  To_Dictionary (Input, Handler.Vlen_Verbatim),
                  Data_List,
                  Method);

            when Dict_Sources.Text_List =>
               declare
                  Selected, Pending : String_Lists.List;
               begin
                  --  Selected and Pending are filled by this call
                  Simple_Dictionary_And_Pending
                    (Make_Word_Counter (Handler, Input),
                     Handler.Dict_Size,
                     Selected,
                     Pending,
                     Method,
                     Handler.Max_Pending);

                  return Optimize_Dictionary
                    (To_Dictionary (Selected, Handler.Vlen_Verbatim),
                     Pending,
                     Input,
                     Handler.Job_Count,
                     Method);
               end;

            when Dict_Sources.Unoptimized_Text_List =>
               --  Same word counting as above, without the optimization
               return To_Dictionary
                 (Simple_Dictionary
                    (Make_Word_Counter (Handler, Input),
                     Handler.Dict_Size,
                     Method),
                  Handler.Vlen_Verbatim);
         end case;
      end To_Dictionary;

   end Dictionary_Subprograms;