Natools

Check-in [014ca1d01b]
Login
Overview
Comment:tools/smaz: implement forced words for optimized dictionary generation
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: 014ca1d01b263357f3c6bbca35dcb70105b5702c
User & Date: nat on 2017-05-18 21:13:36
Other Links: manifest | tags
Context
2017-05-19
21:04
tools/smaz: add command-line options for min and max dictionary size check-in: ecae1e85f4 user: nat tags: trunk
2017-05-18
21:13
tools/smaz: implement forced words for optimized dictionary generation check-in: 014ca1d01b user: nat tags: trunk
2017-05-17
21:44
tools/smaz: use the new version of Worst_Index check-in: 6eaac2a01c user: nat tags: trunk
Changes

Modified tools/smaz.adb from [e22ccc7ac2] to [4b4875a3ad].

365
366
367
368
369
370
371

372
373
374
375
376
377
378
379
380
381

382
383
384
385
386
387
388

389
390
391
392
393
394
395
        return Word_Counter;
         --  Make a word counter from an input word list

      procedure Optimization_Round
        (Dict : in out Holders.Holder;
         Score : in out Ada.Streams.Stream_Element_Count;
         Counts : in out Dictionary_Counts;

         Pending_Words : in out String_Lists.List;
         Input_Texts : in String_Lists.List;
         Job_Count : in Natural;
         Method : in Methods;
         Updated : out Boolean);
      --  Try to improve on Dict by replacing a single entry from it with
      --  one of the substrings in Pending_Words.

      function Optimize_Dictionary
        (Base : in Dictionary;

         Pending_Words : in String_Lists.List;
         Input_Texts : in String_Lists.List;
         Job_Count : in Natural;
         Method : in Methods)
        return Dictionary;
      --  Optimize the dictionary on Input_Texts, starting with Base and
      --  adding substrings from Pending_Words.


      procedure Parallel_Evaluate_Dictionary
        (Job_Count : in Positive;
         Dict : in Dictionary;
         Corpus : in String_Lists.List;
         Compressed_Size : out Ada.Streams.Stream_Element_Count;
         Counts : out Dictionary_Counts);







>










>






|
>







365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
        return Word_Counter;
         --  Make a word counter from an input word list

      procedure Optimization_Round
        (Dict : in out Holders.Holder;
         Score : in out Ada.Streams.Stream_Element_Count;
         Counts : in out Dictionary_Counts;
         First : in Dictionary_Entry;
         Pending_Words : in out String_Lists.List;
         Input_Texts : in String_Lists.List;
         Job_Count : in Natural;
         Method : in Methods;
         Updated : out Boolean);
      --  Try to improve on Dict by replacing a single entry from it with
      --  one of the substrings in Pending_Words.
      --  The entry to replace is the one chosen by Worst_Element, which is
      --  given First and the last code of Dict as bounds, so that entries
      --  before First are not meant to be candidates for removal.
      --  Updated is used by Optimize_Dictionary as its loop condition:
      --  optimization stops after the first round that clears it.

      function Optimize_Dictionary
        (Base : in Dictionary;
         First : in Dictionary_Entry;
         Pending_Words : in String_Lists.List;
         Input_Texts : in String_Lists.List;
         Job_Count : in Natural;
         Method : in Methods)
        return Dictionary;
      --  Optimize the dictionary on Input_Texts, starting with Base and
      --  adding substrings from Pending_Words. Operates only on words
      --  at First and beyond.
      --  Base is first scored with Evaluate_Dictionary, then
      --  Optimization_Round is called repeatedly, on a local copy of
      --  Pending_Words, until a round reports no update.
      --  The Text_List caller uses First to protect the forced words it
      --  has put at the beginning of Base.

      procedure Parallel_Evaluate_Dictionary
        (Job_Count : in Positive;
         Dict : in Dictionary;
         Corpus : in String_Lists.List;
         Compressed_Size : out Ada.Streams.Stream_Element_Count;
         Counts : out Dictionary_Counts);
552
553
554
555
556
557
558

559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
      end Make_Word_Counter;


      procedure Optimization_Round
        (Dict : in out Holders.Holder;
         Score : in out Ada.Streams.Stream_Element_Count;
         Counts : in out Dictionary_Counts;

         Pending_Words : in out String_Lists.List;
         Input_Texts : in String_Lists.List;
         Job_Count : in Natural;
         Method : in Methods;
         Updated : out Boolean)
      is
         use type Ada.Streams.Stream_Element_Offset;

         New_Value : Ada.Strings.Unbounded.Unbounded_String;
         New_Position : String_Lists.Cursor;
         Worst_Index : constant Dictionary_Entry
           := Worst_Element
              (Dict.Element, Counts, Method,
               Dictionary_Entry'First, Last_Code (Dict.Element));
         Worst_Value : constant String
           := Dict_Entry (Dict.Element, Worst_Index);
         Worst_Count : constant String_Count := Counts (Worst_Index);
         Base : constant Dictionary
           := Remove_Element (Dict.Element, Worst_Index);
         Old_Score : constant Ada.Streams.Stream_Element_Count := Score;
      begin







>












|
<







555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575

576
577
578
579
580
581
582
      end Make_Word_Counter;


      procedure Optimization_Round
        (Dict : in out Holders.Holder;
         Score : in out Ada.Streams.Stream_Element_Count;
         Counts : in out Dictionary_Counts;
         First : in Dictionary_Entry;
         Pending_Words : in out String_Lists.List;
         Input_Texts : in String_Lists.List;
         Job_Count : in Natural;
         Method : in Methods;
         Updated : out Boolean)
      is
         use type Ada.Streams.Stream_Element_Offset;

         New_Value : Ada.Strings.Unbounded.Unbounded_String;
         New_Position : String_Lists.Cursor;
         Worst_Index : constant Dictionary_Entry
           := Worst_Element
              (Dict.Element, Counts, Method, First, Last_Code (Dict.Element));

         Worst_Value : constant String
           := Dict_Entry (Dict.Element, Worst_Index);
         Worst_Count : constant String_Count := Counts (Worst_Index);
         Base : constant Dictionary
           := Remove_Element (Dict.Element, Worst_Index);
         Old_Score : constant Ada.Streams.Stream_Element_Count := Score;
      begin
621
622
623
624
625
626
627

628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647

648
649
650
651
652
653
654
               & ')');
         end if;
      end Optimization_Round;


      function Optimize_Dictionary
        (Base : in Dictionary;

         Pending_Words : in String_Lists.List;
         Input_Texts : in String_Lists.List;
         Job_Count : in Natural;
         Method : in Methods)
        return Dictionary
      is
         Holder : Holders.Holder := Holders.To_Holder (Base);
         Pending : String_Lists.List := Pending_Words;
         Score : Ada.Streams.Stream_Element_Count;
         Counts : Dictionary_Counts;
         Running : Boolean := True;
      begin
         Evaluate_Dictionary
           (Job_Count, Base, Input_Texts, Score, Counts);

         while Running loop
            Optimization_Round
              (Holder,
               Score,
               Counts,

               Pending,
               Input_Texts,
               Job_Count,
               Method,
               Running);
         end loop;








>




















>







624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
               & ')');
         end if;
      end Optimization_Round;


      function Optimize_Dictionary
        (Base : in Dictionary;
         First : in Dictionary_Entry;
         Pending_Words : in String_Lists.List;
         Input_Texts : in String_Lists.List;
         Job_Count : in Natural;
         Method : in Methods)
        return Dictionary
      is
         Holder : Holders.Holder := Holders.To_Holder (Base);
         Pending : String_Lists.List := Pending_Words;
         Score : Ada.Streams.Stream_Element_Count;
         Counts : Dictionary_Counts;
         Running : Boolean := True;
      begin
         Evaluate_Dictionary
           (Job_Count, Base, Input_Texts, Score, Counts);

         while Running loop
            Optimization_Round
              (Holder,
               Score,
               Counts,
               First,
               Pending,
               Input_Texts,
               Job_Count,
               Method,
               Running);
         end loop;

1089
1090
1091
1092
1093
1094
1095



1096

1097










1098
1099
1100

1101
1102
1103
1104





1105
1106
1107

1108
1109
1110
1111
1112
1113
1114
                 (Handler,
                  To_Dictionary (Input, Handler.Vlen_Verbatim),
                  Data_List,
                  Method);

            when Dict_Sources.Text_List =>
               declare



                  Selected, Pending : String_Lists.List;

               begin










                  Simple_Dictionary_And_Pending
                    (Make_Word_Counter (Handler, Input),
                     Handler.Dict_Size,

                     Selected,
                     Pending,
                     Method,
                     Handler.Max_Pending);






                  return Optimize_Dictionary
                    (To_Dictionary (Selected, Handler.Vlen_Verbatim),

                     Pending,
                     Input,
                     Handler.Job_Count,
                     Method);
               end;

            when Dict_Sources.Unoptimized_Text_List =>







>
>
>

>

>
>
>
>
>
>
>
>
>
>


<
>




>
>
>
>
>



>







1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118

1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
                 (Handler,
                  To_Dictionary (Input, Handler.Vlen_Verbatim),
                  Data_List,
                  Method);

            when Dict_Sources.Text_List =>
               --  Build an optimized dictionary from a word list, with
               --  the words of Handler.Forced_Words at its beginning.
               declare
                  Needed : constant Integer
                    := Handler.Dict_Size
                     - Natural (Handler.Forced_Words.Length);
                  --  Number of entries left to fill besides forced words,
                  --  zero or negative when forced words alone are enough
                  --  to reach Dict_Size.
                  Selected, Pending : String_Lists.List;
                  First : Dictionary_Entry := Dictionary_Entry'First;
                  --  First entry handed over to Optimize_Dictionary
               begin
                  if Needed <= 0 then
                     --  Forced words alone fill the dictionary: return
                     --  them directly, without any optimization.
                     --  NOTE(review): because of the reverse iteration
                     --  with early return, when there are more forced
                     --  words than Dict_Size the trailing ones are kept
                     --  and the leading ones are dropped; confirm that
                     --  this is intended.
                     for Word of reverse Handler.Forced_Words loop
                        Selected.Prepend (Word);
                        if Positive (Selected.Length) = Handler.Dict_Size then
                           return To_Dictionary
                             (Selected, Handler.Vlen_Verbatim);
                        end if;
                     end loop;
                     --  NOTE(review): should the loop ever complete
                     --  without returning (presumably only when Dict_Size
                     --  is below 1), control falls through with Selected
                     --  already holding the forced words; confirm that
                     --  this cannot happen.
                  end if;

                  --  Fill the entries not taken by forced words, and keep
                  --  the other candidates in Pending for the optimizer.
                  Simple_Dictionary_And_Pending
                    (Make_Word_Counter (Handler, Input),

                     Needed,
                     Selected,
                     Pending,
                     Method,
                     Handler.Max_Pending);

                  --  Put forced words in front of the selected ones, in
                  --  their original order (reverse iteration combined
                  --  with Prepend), and move First past them.
                  for Word of reverse Handler.Forced_Words loop
                     Selected.Prepend (Word);
                     First := Dictionary_Entry'Succ (First);
                  end loop;

                  return Optimize_Dictionary
                    (To_Dictionary (Selected, Handler.Vlen_Verbatim),
                     First,
                     Pending,
                     Input,
                     Handler.Job_Count,
                     Method);
               end;

            when Dict_Sources.Unoptimized_Text_List =>