Overview
Comment: | tools/smaz: new command-line option to filter substrings by count |
---|---|
Timelines: | family | ancestors | descendants | both | trunk |
Files: | files | file ages | folders |
SHA1: | 637ebd90fa8c57ea1455e889a2478c4b |
User & Date: | nat on 2016-10-21 19:32:41 |
Other Links: | manifest | tags |
Context
2016-10-22
| ||
19:21 | tools/smaz: replace "word list" with clearer "[sample] text list" check-in: db2278efbb user: nat tags: trunk | |
2016-10-21
| ||
19:32 | tools/smaz: new command-line option to filter substrings by count check-in: 637ebd90fa user: nat tags: trunk | |
2016-10-20
| ||
20:15 | smaz-tools: new primitive to filter substrings without enough counts check-in: f9d16b99f9 user: nat tags: trunk | |
Changes
Modified tools/smaz.adb from [ef8c32c6cd] to [71cfa2a6e9].
︙ | ︙ | |||
53 54 55 56 57 58 59 60 61 62 63 64 65 66 | package Options is type Id is (Output_Ada_Dict, Dictionary_Input, Decode, Encode, Evaluate, Output_Hash, Job_Count, Help, Sx_Dict_Output, Min_Sub_Size, Max_Sub_Size, Stat_Output, | > | 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 | package Options is type Id is (Output_Ada_Dict, Dictionary_Input, Decode, Encode, Evaluate, Filter_Threshold, Output_Hash, Job_Count, Help, Sx_Dict_Output, Min_Sub_Size, Max_Sub_Size, Stat_Output, |
︙ | ︙ | |||
79 80 81 82 83 84 85 86 87 88 89 90 91 92 | Stat_Output : Boolean := False; Sx_Output : Boolean := False; Sx_Dict_Output : Boolean := False; Min_Sub_Size : Positive := 1; Max_Sub_Size : Positive := 3; Max_Word_Size : Positive := 10; Job_Count : Natural := 0; Action : Actions.Enum := Actions.Nothing; Ada_Dictionary : Ada.Strings.Unbounded.Unbounded_String; Hash_Package : Ada.Strings.Unbounded.Unbounded_String; Dict_Source : Dict_Sources.Enum := Dict_Sources.S_Expression; end record; overriding procedure Option | > | 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 | Stat_Output : Boolean := False; Sx_Output : Boolean := False; Sx_Dict_Output : Boolean := False; Min_Sub_Size : Positive := 1; Max_Sub_Size : Positive := 3; Max_Word_Size : Positive := 10; Job_Count : Natural := 0; Filter_Threshold : Natools.Smaz.Tools.String_Count := 0; Action : Actions.Enum := Actions.Nothing; Ada_Dictionary : Ada.Strings.Unbounded.Unbounded_String; Hash_Package : Ada.Strings.Unbounded.Unbounded_String; Dict_Source : Dict_Sources.Enum := Dict_Sources.S_Expression; end record; overriding procedure Option |
︙ | ︙ | |||
200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 | Handler.Max_Sub_Size := Positive'Value (Argument); when Options.Max_Word_Size => Handler.Max_Word_Size := Positive'Value (Argument); when Options.Job_Count => Handler.Job_Count := Natural'Value (Argument); end case; end Option; function Getopt_Config return Getopt.Configuration is use Getopt; use Options; R : Getopt.Configuration; begin R.Add_Option ("ada-dict", 'A', Optional_Argument, Output_Ada_Dict); R.Add_Option ("decode", 'd', No_Argument, Decode); R.Add_Option ("dict", 'D', No_Argument, Dictionary_Input); R.Add_Option ("encode", 'e', No_Argument, Encode); R.Add_Option ("evaluate", 'E', No_Argument, Evaluate); R.Add_Option ("help", 'h', No_Argument, Help); R.Add_Option ("hash-pkg", 'H', Required_Argument, Output_Hash); R.Add_Option ("jobs", 'j', Required_Argument, Job_Count); R.Add_Option ("sx-dict", 'L', No_Argument, Sx_Dict_Output); R.Add_Option ("min-substring", 'm', Required_Argument, Min_Sub_Size); R.Add_Option ("max-substring", 'M', Required_Argument, Max_Sub_Size); R.Add_Option ("stats", 's', No_Argument, Stat_Output); | > > > > > | 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 | Handler.Max_Sub_Size := Positive'Value (Argument); when Options.Max_Word_Size => Handler.Max_Word_Size := Positive'Value (Argument); when Options.Job_Count => Handler.Job_Count := Natural'Value (Argument); when Options.Filter_Threshold => Handler.Filter_Threshold := Natools.Smaz.Tools.String_Count'Value (Argument); end case; end Option; function Getopt_Config return Getopt.Configuration is use Getopt; use Options; R : Getopt.Configuration; begin R.Add_Option ("ada-dict", 'A', Optional_Argument, Output_Ada_Dict); R.Add_Option ("decode", 'd', No_Argument, Decode); R.Add_Option ("dict", 'D', No_Argument, Dictionary_Input); R.Add_Option ("encode", 'e', No_Argument, Encode); 
R.Add_Option ("evaluate", 'E', No_Argument, Evaluate); R.Add_Option ("filter", 'F', Required_Argument, Filter_Threshold); R.Add_Option ("help", 'h', No_Argument, Help); R.Add_Option ("hash-pkg", 'H', Required_Argument, Output_Hash); R.Add_Option ("jobs", 'j', Required_Argument, Job_Count); R.Add_Option ("sx-dict", 'L', No_Argument, Sx_Dict_Output); R.Add_Option ("min-substring", 'm', Required_Argument, Min_Sub_Size); R.Add_Option ("max-substring", 'M', Required_Argument, Max_Sub_Size); R.Add_Option ("stats", 's', No_Argument, Stat_Output); |
︙ | ︙ | |||
467 468 469 470 471 472 473 474 475 476 477 478 479 480 | Put_Line (Output, Indent & Indent & "Evaluate the dictionary on the input given corpus"); when Options.Job_Count => New_Line (Output); Put_Line (Output, Indent & Indent & "Number of parallel jobs in long calculations"); end case; end loop; end Print_Help; function To_Dictionary (Handler : in Callback'Class; Input : in Natools.Smaz.Tools.String_Lists.List) | > > > > > > > | > > > > > > > | 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 | Put_Line (Output, Indent & Indent & "Evaluate the dictionary on the input given corpus"); when Options.Job_Count => New_Line (Output); Put_Line (Output, Indent & Indent & "Number of parallel jobs in long calculations"); when Options.Filter_Threshold => Put_Line (Output, " <threshold>"); Put_Line (Output, Indent & Indent & "Before building a dictionary from substrings, remove"); Put_Line (Output, Indent & Indent & "substrings whose count is below the threshold."); end case; end loop; end Print_Help; function To_Dictionary (Handler : in Callback'Class; Input : in Natools.Smaz.Tools.String_Lists.List) return Natools.Smaz.Dictionary is use type Natools.Smaz.Tools.String_Count; begin case Handler.Dict_Source is when Dict_Sources.S_Expression => return Natools.Smaz.Tools.To_Dictionary (Input, True); when Dict_Sources.Word_List => declare Counter : Natools.Smaz.Tools.Word_Counter; begin for S of Input loop Natools.Smaz.Tools.Add_Substrings (Counter, S, Handler.Min_Sub_Size, Handler.Max_Sub_Size); if Handler.Max_Word_Size > Handler.Max_Sub_Size then Natools.Smaz.Tools.Add_Words (Counter, S, Handler.Max_Sub_Size + 1, Handler.Max_Word_Size); end if; end loop; if Handler.Filter_Threshold > 0 then Natools.Smaz.Tools.Filter_By_Count (Counter, Handler.Filter_Threshold); end if; return Natools.Smaz.Tools.To_Dictionary 
(Natools.Smaz.Tools.Simple_Dictionary (Counter, 254), True); end; end case; end To_Dictionary; |
︙ | ︙ |