Natools

Check-in [fb7d230fe5]
Login
Overview
Comment:tools/smaz: actually implement dictionary optimization
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: fb7d230fe5900baca2b05464b4924ec3a8d2b7fc
User & Date: nat on 2016-10-29 20:29:20
Other Links: manifest | tags
Context
2016-10-30
18:19
tools/sxcat: add a command-line option to output a list of input atoms check-in: ac88f5abfb user: nat tags: trunk
2016-10-29
20:29
tools/smaz: actually implement dictionary optimization check-in: fb7d230fe5 user: nat tags: trunk
2016-10-28
20:56
tools/smaz: refactor scores out of the evaluation block check-in: fbe80ac184 user: nat tags: trunk
Changes

Modified tools/smaz.adb from [7214a27560] to [fa5ab81891].

199
200
201
202
203
204
205






206
207
208
209
210
211
212
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218







+
+
+
+
+
+








   function To_Dictionary
     (Handler : in Callback'Class;
      Input : in Natools.Smaz.Tools.String_Lists.List)
     return Natools.Smaz.Dictionary;
      --  Convert Input into a dictionary, according to the options
      --  held in Handler

   function Worst_Index
     (Dict : in Natools.Smaz.Dictionary;
      Counts : in Natools.Smaz.Tools.Dictionary_Counts)
     return Ada.Streams.Stream_Element;
      --  Return the index of the worst-scored item in Dict, i.e. the entry
      --  with the lowest score computed from Counts. Dict is not modified.


   overriding procedure Option
     (Handler  : in out Callback;
      Id       : in Options.Id;
      Argument : in String) is
   begin
      case Id is
355
356
357
358
359
360
361

362
363
364
365
















366
367
368
369









































370
371
372
373
374
375
376
361
362
363
364
365
366
367
368




369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384




385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432







+
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+







      Score : in out Ada.Streams.Stream_Element_Count;
      Counts : in out Natools.Smaz.Tools.Dictionary_Counts;
      Pending_Words : in out Natools.Smaz.Tools.String_Lists.List;
      Input_Texts : in Natools.Smaz.Tools.String_Lists.List;
      Job_Count : in Natural;
      Updated : out Boolean)
   is
      use type Ada.Streams.Stream_Element_Offset;
      pragma Unreferenced (Dict);
      pragma Unreferenced (Score);
      pragma Unreferenced (Counts);
      pragma Unreferenced (Pending_Words);

      New_Value : Ada.Strings.Unbounded.Unbounded_String;
      New_Position : Natools.Smaz.Tools.String_Lists.Cursor;
      Worst_Index : constant Ada.Streams.Stream_Element
        := Smaz.Worst_Index (Dict.Element, Counts);
      Worst_Value : constant String
        := Natools.Smaz.Dict_Entry (Dict.Element, Worst_Index);
      Worst_Count : constant Natools.Smaz.Tools.String_Count
        := Counts (Worst_Index);
      Base : constant Natools.Smaz.Dictionary
        := Natools.Smaz.Tools.Remove_Element (Dict.Element, Worst_Index);
      Old_Score : constant Ada.Streams.Stream_Element_Count := Score;
   begin
      Updated := False;

      for Position in Pending_Words.Iterate loop
      pragma Unreferenced (Input_Texts);
      pragma Unreferenced (Job_Count);
   begin
      Updated := False;
         declare
            Word : constant String
              := Natools.Smaz.Tools.String_Lists.Element (Position);
            New_Dict : constant Natools.Smaz.Dictionary
              := Natools.Smaz.Tools.Append_String (Base, Word);
            New_Score : Ada.Streams.Stream_Element_Count;
            New_Counts : Natools.Smaz.Tools.Dictionary_Counts;
         begin
            Evaluate_Dictionary
              (Job_Count, New_Dict, Input_Texts, New_Score, New_Counts);

            if New_Score < Score then
               Dict := Holders.To_Holder (New_Dict);
               Score := New_Score;
               Counts := New_Counts;
               New_Value := Ada.Strings.Unbounded.To_Unbounded_String (Word);
               New_Position := Position;
               Updated := True;
            end if;
         end;
      end loop;

      if Updated then
         Pending_Words.Delete (New_Position);
         Pending_Words.Append (Worst_Value);

         Ada.Text_IO.Put_Line
           (Ada.Text_IO.Current_Error,
            "Removing"
            & Worst_Count'Img & "x "
            & Natools.String_Escapes.C_Escape_Hex (Worst_Value, True)
            & ", adding"
            & Counts (Dict.Element.Dict_Last)'Img & "x "
            & Natools.String_Escapes.C_Escape_Hex
               (Ada.Strings.Unbounded.To_String (New_Value), True)
            & ", size"
            & Score'Img
            & " ("
            & Ada.Streams.Stream_Element_Offset'Image (Score - Old_Score)
            & ')');
      end if;
   end Optimization_Round;


   function Optimize_Dictionary
     (Base : in Natools.Smaz.Dictionary;
      Pending_Words : in Natools.Smaz.Tools.String_Lists.List;
      Input_Texts : in Natools.Smaz.Tools.String_Lists.List;
704
705
706
707
708
709
710























711
712
713
714
715
716
717
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796







+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+







                  return Natools.Smaz.Tools.To_Dictionary
                    (Natools.Smaz.Tools.Simple_Dictionary (Counter, 254),
                     True);
               end if;
            end;
      end case;
   end To_Dictionary;


   function Worst_Index
     (Dict : in Natools.Smaz.Dictionary;
      Counts : in Natools.Smaz.Tools.Dictionary_Counts)
     return Ada.Streams.Stream_Element
   is
      --  Linear scan for the entry with the smallest encoded score.
      --  Entry 0 is the initial candidate; a later entry replaces it only
      --  when its score is strictly lower, so ties keep the lowest index.

      Lowest_Index : Ada.Streams.Stream_Element := 0;
      Lowest_Score : Score_Value := Score_Encoded (Dict, Counts, 0);
   begin
      for Index in 1 .. Dict.Dict_Last loop
         declare
            Candidate : constant Score_Value
              := Score_Encoded (Dict, Counts, Index);
         begin
            if Candidate < Lowest_Score then
               Lowest_Score := Candidate;
               Lowest_Index := Index;
            end if;
         end;
      end loop;

      return Lowest_Index;
   end Worst_Index;


   Opt_Config : constant Getopt.Configuration := Getopt_Config;
   Handler : Callback;
   Input_List, Input_Data : Natools.Smaz.Tools.String_Lists.List;
begin
   Process_Command_Line :
   begin