Index: tools/smaz.adb
==================================================================
--- tools/smaz.adb
+++ tools/smaz.adb
@@ -54,10 +54,14 @@
       type Enum is
         (S_Expression,
          Text_List,
          Unoptimized_Text_List);
    end Dict_Sources;
+
+   package Methods is
+      type Enum is (Encoded, Frequency, Gain);
+   end Methods;
 
    package Options is
       type Id is
         (Output_Ada_Dict,
          Dictionary_Input,
@@ -75,11 +79,12 @@
          No_Stat_Output,
          Text_List_Input,
          Fast_Text_Input,
          Max_Word_Size,
          Sx_Output,
-         No_Sx_Output);
+         No_Sx_Output,
+         Score_Method);
    end Options;
 
    package Getopt is new Natools.Getopt_Long (Options.Id);
 
    type Callback is new Getopt.Handlers.Callback with record
@@ -91,10 +96,11 @@
       Min_Sub_Size : Positive := 1;
       Max_Sub_Size : Positive := 3;
       Max_Word_Size : Positive := 10;
       Job_Count : Natural := 0;
       Filter_Threshold : Natools.Smaz.Tools.String_Count := 0;
+      Score_Method : Methods.Enum := Methods.Encoded;
       Action : Actions.Enum := Actions.Nothing;
       Ada_Dictionary : Ada.Strings.Unbounded.Unbounded_String;
       Hash_Package : Ada.Strings.Unbounded.Unbounded_String;
       Dict_Source : Dict_Sources.Enum := Dict_Sources.S_Expression;
    end record;
@@ -120,33 +126,35 @@
       --  depending on Job_Count.
 
    function Getopt_Config return Getopt.Configuration;
       --  Build the configuration object
 
-   procedure Optimization_Round
-     (Dict : in out Holders.Holder;
-      Score : in out Ada.Streams.Stream_Element_Count;
-      Counts : in out Natools.Smaz.Tools.Dictionary_Counts;
-      Pending_Words : in out Natools.Smaz.Tools.String_Lists.List;
-      Input_Texts : in Natools.Smaz.Tools.String_Lists.List;
-      Job_Count : in Natural;
-      Updated : out Boolean);
-      --  Try to improve on Dict by replacing a single entry from it with
-      --  one of the substring in Pending_Words.
-
    function Length
      (Dictionary : in Natools.Smaz.Dictionary;
       E : in Ada.Streams.Stream_Element)
      return Score_Value
      is (Natools.Smaz.Dict_Entry (Dictionary, E)'Length);
       --  Length of a dictionary entry
 
+   procedure Optimization_Round
+     (Dict : in out Holders.Holder;
+      Score : in out Ada.Streams.Stream_Element_Count;
+      Counts : in out Natools.Smaz.Tools.Dictionary_Counts;
+      Pending_Words : in out Natools.Smaz.Tools.String_Lists.List;
+      Input_Texts : in Natools.Smaz.Tools.String_Lists.List;
+      Job_Count : in Natural;
+      Method : in Methods.Enum;
+      Updated : out Boolean);
+      --  Try to improve on Dict by replacing a single entry from it with
+      --  one of the substring in Pending_Words.
+
    function Optimize_Dictionary
      (Base : in Natools.Smaz.Dictionary;
       Pending_Words : in Natools.Smaz.Tools.String_Lists.List;
       Input_Texts : in Natools.Smaz.Tools.String_Lists.List;
-      Job_Count : in Natural)
+      Job_Count : in Natural;
+      Method : in Methods.Enum)
      return Natools.Smaz.Dictionary;
       --  Optimize the dictionary on Input_Texts, starting with Base and
       --  adding substrings from Pending_Words.
 
    procedure Parallel_Evaluate_Dictionary
@@ -195,19 +203,32 @@
       E : Ada.Streams.Stream_Element)
      return Score_Value
      is (Score_Value (Counts (E)) * (Length (Dictionary, E) - 1));
       --  Score value using the number of bytes saved using E
 
+   function Score
+     (Dictionary : in Natools.Smaz.Dictionary;
+      Counts : in Natools.Smaz.Tools.Dictionary_Counts;
+      E : in Ada.Streams.Stream_Element;
+      Method : in Methods.Enum)
+     return Score_Value
+     is (case Method is
+         when Methods.Encoded => Score_Encoded (Dictionary, Counts, E),
+         when Methods.Frequency => Score_Frequency (Dictionary, Counts, E),
+         when Methods.Gain => Score_Gain (Dictionary, Counts, E));
+      --  Score value with dynamically chosen method
+
    function To_Dictionary
      (Handler : in Callback'Class;
       Input : in Natools.Smaz.Tools.String_Lists.List)
      return Natools.Smaz.Dictionary;
       --  Convert the input into a dictionary given the option in Handler
 
    function Worst_Index
      (Dict : in Natools.Smaz.Dictionary;
-      Counts : in Natools.Smaz.Tools.Dictionary_Counts)
+      Counts : in Natools.Smaz.Tools.Dictionary_Counts;
+      Method : in Methods.Enum)
      return Ada.Streams.Stream_Element;
       --  Remove the worstly-scored item from Dict
 
 
    overriding procedure Option
@@ -285,10 +306,13 @@
             Handler.Job_Count := Natural'Value (Argument);
 
          when Options.Filter_Threshold =>
             Handler.Filter_Threshold
               := Natools.Smaz.Tools.String_Count'Value (Argument);
+
+         when Options.Score_Method =>
+            Handler.Score_Method := Methods.Enum'Value (Argument);
       end case;
    end Option;
 
 
    procedure Evaluate_Dictionary
@@ -349,10 +373,11 @@
       R.Add_Option ("text-list", 't', No_Argument, Text_List_Input);
       R.Add_Option ("fast-text-list", 'T', No_Argument, Fast_Text_Input);
       R.Add_Option ("max-word-len", 'W', Required_Argument, Max_Word_Size);
       R.Add_Option ("s-expr", 'x', No_Argument, Sx_Output);
       R.Add_Option ("no-s-expr", 'X', No_Argument, No_Sx_Output);
+      R.Add_Option ("score-method", Required_Argument, Score_Method);
 
       return R;
    end Getopt_Config;
 
 
@@ -361,18 +386,19 @@
       Score : in out Ada.Streams.Stream_Element_Count;
       Counts : in out Natools.Smaz.Tools.Dictionary_Counts;
       Pending_Words : in out Natools.Smaz.Tools.String_Lists.List;
       Input_Texts : in Natools.Smaz.Tools.String_Lists.List;
       Job_Count : in Natural;
+      Method : in Methods.Enum;
       Updated : out Boolean)
    is
       use type Ada.Streams.Stream_Element_Offset;
 
       New_Value : Ada.Strings.Unbounded.Unbounded_String;
       New_Position : Natools.Smaz.Tools.String_Lists.Cursor;
       Worst_Index : constant Ada.Streams.Stream_Element
-        := Smaz.Worst_Index (Dict.Element, Counts);
+        := Smaz.Worst_Index (Dict.Element, Counts, Method);
       Worst_Value : constant String
         := Natools.Smaz.Dict_Entry (Dict.Element, Worst_Index);
       Worst_Count : constant Natools.Smaz.Tools.String_Count
         := Counts (Worst_Index);
       Base : constant Natools.Smaz.Dictionary
@@ -428,11 +454,12 @@
 
    function Optimize_Dictionary
      (Base : in Natools.Smaz.Dictionary;
       Pending_Words : in Natools.Smaz.Tools.String_Lists.List;
       Input_Texts : in Natools.Smaz.Tools.String_Lists.List;
-      Job_Count : in Natural)
+      Job_Count : in Natural;
+      Method : in Methods.Enum)
      return Natools.Smaz.Dictionary
    is
       Holder : Holders.Holder := Holders.To_Holder (Base);
       Pending : Natools.Smaz.Tools.String_Lists.List := Pending_Words;
       Score : Ada.Streams.Stream_Element_Count;
@@ -447,10 +474,11 @@
             Score,
             Counts,
             Pending,
             Input_Texts,
             Job_Count,
+            Method,
             Running);
       end loop;
 
       return Holder.Element;
    end Optimize_Dictionary;
@@ -704,10 +732,16 @@
                Put_Line (Output, " ");
                Put_Line (Output, Indent & Indent
                  & "Before building a dictionary from substrings, remove");
                Put_Line (Output, Indent & Indent
                  & "substrings whose count is below the threshold.");
+
+            when Options.Score_Method =>
+               Put_Line (Output, " ");
+               Put_Line (Output, Indent & Indent
+                 & "Select heuristic method to replace dictionary items"
+                 & " during optimization");
          end case;
       end loop;
    end Print_Help;
 
 
@@ -752,11 +786,12 @@
 
                return Optimize_Dictionary
                  (Natools.Smaz.Tools.To_Dictionary (Selected, True),
                   Pending,
                   Input,
-                  Handler.Job_Count);
+                  Handler.Job_Count,
+                  Handler.Score_Method);
             end;
          else
             return Natools.Smaz.Tools.To_Dictionary
               (Natools.Smaz.Tools.Simple_Dictionary (Counter, 254),
                True);
@@ -766,19 +801,20 @@
    end To_Dictionary;
 
 
    function Worst_Index
      (Dict : in Natools.Smaz.Dictionary;
-      Counts : in Natools.Smaz.Tools.Dictionary_Counts)
+      Counts : in Natools.Smaz.Tools.Dictionary_Counts;
+      Method : in Methods.Enum)
      return Ada.Streams.Stream_Element
    is
       Result : Ada.Streams.Stream_Element := 0;
-      Worst_Score : Score_Value := Score_Encoded (Dict, Counts, 0);
+      Worst_Score : Score_Value := Score (Dict, Counts, 0, Method);
       S : Score_Value;
    begin
       for I in 1 .. Dict.Dict_Last loop
-         S := Score_Encoded (Dict, Counts, I);
+         S := Score (Dict, Counts, I, Method);
          if S < Worst_Score then
             Result := I;
             Worst_Score := S;
          end if;