opuscleaner

View on PyPIReverse Dependencies (0)

0.4.4 opuscleaner-0.4.4-py3-none-any.whl

Wheel Details

Project: opuscleaner
Version: 0.4.4
Filename: opuscleaner-0.4.4-py3-none-any.whl
Download: [link]
Size: 342072
MD5: de84442ae1602186f1d3bf406d7c8083
SHA256: b9883928edbb7135dd9a9a90cabb3c540a3d7f5e69d42298a575254087ccea8e
Uploaded: 2025-01-16 16:05:31 +0000

dist-info

METADATA

Metadata-Version: 2.4
Name: opuscleaner
Version: 0.4.4
Author-Email: Jelmer van der Linde <jelmer[at]ikhoefgeen.nl>
Project-Url: Documentation, https://github.com/hplt-project/opuscleaner#readme
Project-Url: Issues, https://github.com/hplt-project/opuscleaner/issues
Project-Url: Source, https://github.com/hplt-project/opuscleaner
Classifier: Development Status :: 4 - Beta
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: Implementation :: CPython
Classifier: Programming Language :: Python :: Implementation :: PyPy
Requires-Python: >=3.9
Requires-Dist: fastapi (==0.78.0)
Requires-Dist: pyyaml (>=6.0.1)
Requires-Dist: uvicorn (==0.20.0)
Requires-Dist: xxhash (==3.2.0)
Requires-Dist: bicleaner; extra == "all"
Requires-Dist: bifixer; extra == "all"
Requires-Dist: fasttext; extra == "all"
Requires-Dist: fugashi[unidic-lite] (==1.1.2); extra == "all"
Requires-Dist: hanziconv (==0.3.2); extra == "all"
Requires-Dist: laserembeddings[ja,zh] (==1.1.2); extra == "all"
Requires-Dist: more-itertools; extra == "all"
Requires-Dist: opusfilter (==2.6.0); extra == "all"
Requires-Dist: pycld2 (==0.41); extra == "all"
Requires-Dist: requests; extra == "all"
Requires-Dist: sacremoses; extra == "all"
Requires-Dist: spacy-pkuseg; extra == "all"
Provides-Extra: all
Description-Content-Type: text/markdown
License-Expression: MIT
[Description omitted; length: 5392 characters]

WHEEL

Wheel-Version: 1.0
Generator: hatchling 1.27.0
Root-Is-Purelib: true
Tag: py3-none-any

RECORD

Path Digest Size
opuscleaner/__about__.py sha256=MGDNnKvehUI8OWjdoOHii-gwj0FYKZexEwWZYzqPbhY 18
opuscleaner/__init__.py sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU 0
opuscleaner/_util.py sha256=hLgqtZ4WwElrlqByJjmnaiBb2KbEUtHPTzuWCJTmxTY 3500
opuscleaner/categories.py sha256=NnVJEtWwWi1rNer6L0pWxxrCypD9Hlq0nF_vmfKMoiY 1707
opuscleaner/clean.py sha256=Dy_mIE0_ipZ_2bmtptbpqzfU3Xf40uOnoedfJ73p-Rg 26758
opuscleaner/col.py sha256=SlzLRRgqZWUUMNVTXTjPCX5EgbW0IZYxuiyNUoeg4sY 3163
opuscleaner/config.py sha256=DrxOxIpQYnvDSsm-adl-a07_w_bkTl3O1oO1pjvm2ws 1490
opuscleaner/datasets.py sha256=WluDKWcL9seLsh34Ta-9j6tWBy4Pdr43y5Qq7wj7aG8 4525
opuscleaner/download.py sha256=HqYnOmd4EW9_FfSdhAMB7qnpdO-wEuARMH7DX9Efxy0 13710
opuscleaner/filters.py sha256=FD3tqwpAzLuBt544VS194wfymXnCUWXHxEYeC0A18gM 7901
opuscleaner/logging.py sha256=CnRmBEtlPrtQI0bjKPfc0SxGJmqAV0QmA6RVHxn-OxA 7339
opuscleaner/opusfilter_compat.py sha256=pb1MjCyKmYKlvogm_Y2fVyRDmyrFdegpm1nmRAgo7Lc 6452
opuscleaner/sample.py sha256=eXv9_BdZPL4Nx32xLWV2TzRMXjOUWtmkuVCSRixCmAw 4794
opuscleaner/server.py sha256=nE4Kk3zQIrRREgsvgCUTLQfr0YtlxpECz5S2pd4QHV0 15077
opuscleaner/threshold.py sha256=OBHTNV2IHrWCtxj7fILHsNTUrpCiMbvxPdBuSMHouAA 8153
opuscleaner/filters/alpha_ratio.json sha256=B-8P7D4_bh1BHiPrnohHXNsKlr1fHoQR2006a0ikhWI 3699
opuscleaner/filters/alpha_ratio.py sha256=L2PoRbd9jQUqX6Z-eZkqKQ9y2QzZDKJg4cc68nGQu5c 4288
opuscleaner/filters/bicleaner_hardrules.json sha256=PLZm502DA1FGuFz9baJcwvaQFyRKcNUuVtmlsj_PMPI 1261
opuscleaner/filters/bifixer.json sha256=k5SoI1PupsJEJhDf5mZ6GPjFap6kCxrgPV_SwV8OYrY 3714
opuscleaner/filters/bifixer_dedupe.py sha256=hp6vs3pFZ7FFioSbPfjLOWWukeo9udCnX6mfoKyJ4mk 454
opuscleaner/filters/clean_common.py sha256=4iFbW069hg6xInlIGK06FQ1f22O6YAMECRPlwCyZGh0 4263
opuscleaner/filters/deescape-special-chars.json sha256=aSnzlrENPRaoI4Z-94sRj1EWLsGmT7hvTgJs0KwJsug 496
opuscleaner/filters/deescape-special-chars.perl sha256=YrUz1T7J91dGJHngP3oUksnAej2f_0ps1GJR5HwUscI 631
opuscleaner/filters/deescape_tsv.json sha256=cRrF55vUpp2lbThJ2zDsxwMhhOVkGqb-WHjjgZS57xA 190
opuscleaner/filters/deescape_tsv.py sha256=Gi0kTOdv-4CyeG1vzhhIsDPg_QHYuYUtZgheQ2NZgb8 363
opuscleaner/filters/detokenizer.json sha256=iMsyKuh0IoIAMwzjsV9pdGy7J-96gSNtK_w7a9P_d_g 1232
opuscleaner/filters/detokenizer.perl sha256=hnz-0bP9zu_moUhamYZu5ZIGXvtg1Rx0GLowQWb4liw 12473
opuscleaner/filters/fasttext_filter.json sha256=JX9U08q9ouDDWc-dr7fB4RR3mM467KVIqzoTHSG0O4c 8692
opuscleaner/filters/fasttext_filter.py sha256=rczY_wI2DJgZWxYuvLdmmobM6WwdqleVikoWFhb8Xn0 3173
opuscleaner/filters/fix_elitr_eca.json sha256=WpUpFsA2iLEdpETVr5nNUuv5GvaQ67VCAr6FtjUL9TU 180
opuscleaner/filters/fix_elitr_eca.py sha256=X4eAKNz8bvgIuWlHfCopvWHESjpUmlMyHW86w5jExyI 1473
opuscleaner/filters/fix_quotes.json sha256=AYf3yOd8UO5gC9vTTVSO4vpF3UNeqqapYF7pYnxm9hI 116
opuscleaner/filters/fix_quotes.py sha256=xcH-Gc6cFrSWzhUMrqDOl1Ih6J8EN4camWyS5_TcNUs 322
opuscleaner/filters/fix_sent_final_punct.json sha256=z23-yiglyJ5u7E5emJtfr4CsscYONiFTFErrGguLTj4 299
opuscleaner/filters/fix_sent_final_punct.py sha256=7l5vfoJcQXAGJeGEs7L98hQrBLbX-KgtnlNKxksuKCw 2191
opuscleaner/filters/fix_un_chinese.json sha256=6uQ9UBASDSiR6MK9KfrOTwE246skhOr4T_rvRm--6p4 218
opuscleaner/filters/fix_un_chinese.py sha256=iBrtt0T0sBcx1zEsnryTUI7BT4K1vqijg7_qRTUi2_k 354
opuscleaner/filters/fix_wiki.json sha256=Cxud6fNmB_n1xTwZQoT-GiAYL862hXz8Q9m9TsYZHm8 1344
opuscleaner/filters/fix_wiki.py sha256=R6zcrBjEKNLL7nRBoD1LDFYbB-qUVw7e2krDNYELxUA 3569
opuscleaner/filters/langid.json sha256=X5PQVJI3GlgWM57ujdvKQskeyXvcHD0M_ijZDIJZSWA 689
opuscleaner/filters/langid.py sha256=KYPc_2grpqs1Q439rV3iPPYP2KPOytKMbsAyVNHLiUc 3019
opuscleaner/filters/laser_similarity.json sha256=Ip8UqavUb0Ul_QpwNdMxTklfyYDlb0fgPkFAvvrXrYU 853
opuscleaner/filters/laser_similarity.py sha256=QPTHjs40ffGxKfbUsNa2X06IMbXpbKD18OpkPrI81Kc 4769
opuscleaner/filters/max_length.json sha256=lMKoVux4rsqZnX3L6BzsH9hGYVR2cE7darWXY5hamkM 452
opuscleaner/filters/max_length.py sha256=DbQuoVfwkvVSCpfuTU2GWe7-DTIBM-V2sd5I2AYd_1M 1545
opuscleaner/filters/max_word_length.json sha256=uKYYTF2hvP5_WoNF33Q1D1zlMJwg4h89Uayp-pZ2Q-8 446
opuscleaner/filters/max_word_length.py sha256=fInOT6qosziYL4tnWO5lIRgBHed5KAi0ckjg5HK0-W4 869
opuscleaner/filters/normalize_whitespace.json sha256=XwMP9kpaeGdqnK8qqxO2eopEHdZa1G6zCxk2S7ecf9E 325
opuscleaner/filters/normalize_whitespace.py sha256=WMuwWODLL6LgkPemnL8DYb3P_bbSLGdmnnLVswiib8A 806
opuscleaner/filters/num_mismatch.json sha256=XDwhewISTNxeEoi0aPIVWi4Bwqgc5toa6uUoUCbB0I8 493
opuscleaner/filters/num_mismatch.py sha256=ojIpv6OygCBWD42tsyY5dGL2TrTrWOuSqYU4nFmxoIQ 1868
opuscleaner/filters/regexp.json sha256=xXySs9Jlh5a1CZMk_K6o2rQtmgVYtumtZ_66lO3AnXo 216
opuscleaner/filters/remove_empty_lines.json sha256=NsBSbv_SFCYBFPstsQm5tJhJL3cKRwhBQFGbmGaiVvQ 176
opuscleaner/filters/remove_empty_lines.py sha256=XUQeK8ce6y9rjR5B9kayeeD__UmtHZI8fPj5kJttJ0w 341
opuscleaner/filters/remove_frequent_patterns.json sha256=FwZNO5kbpny-v-sm9oVRCYKSg-r_jvOmtNsMaElsssI 402
opuscleaner/filters/remove_frequent_patterns.py sha256=2zBx3qZ88CqKXyKuT-yYDb0w-1_1s8C4aepsdwYTm50 2700
opuscleaner/filters/remove_frequent_patterns.txt sha256=H1L0x6sbKI-TazzJW_5AP809StDh7_vDwyfMLTCmEOo 417
opuscleaner/filters/sed.json sha256=qJIA6oqGywsrW7saokrh3R6ydJAeW8jfUk_D_f8IQQw 335
opuscleaner/filters/segment_chinese.json sha256=l_yNMmSLFandBvCpiYfIgKEEv4BoM3nO3qmqOpv2CvI 240
opuscleaner/filters/segment_chinese.py sha256=XYM0ARBFNcCKphVgzElp9hPe5OSskK6uRmhDrgLcAWo 200
opuscleaner/filters/segment_japanese.json sha256=_cLMEMueM6ZrNjMhX8Qpe2h7yDtekdUhrv4lqNpvpeY 432
opuscleaner/filters/segment_japanese.py sha256=hMTkxWTXiJPZm98PjziXBfMvRtgebwpkMvOXTBL9cZ8 588
opuscleaner/filters/simplify_chinese.json sha256=OZVuscTAsg2NGp6ASQ_EzeN6mODE2A4BpUAFypvBJak 154
opuscleaner/filters/split_sentences.json sha256=OjcJMMj15oCIdrDFrH65uUhNL9ZbskdLtyWBYAh0TrE 576
opuscleaner/filters/split_sentences.py sha256=EkRHf93Zb4UWTo9iPdwy473DdyMEe_Vk8t_REd1Jnmg 1059
opuscleaner/filters/src_trg_ratio.json sha256=41Hy5zGMq9hHtz0UHCIRtdak1E6BrP7BJKXty7WdB20 495
opuscleaner/filters/src_trg_ratio.py sha256=rsRWlgptlcbXae0uEqAnW3JomfD0i2iqKwl5C3l6z-I 2298
opuscleaner/filters/strip_suffix.json sha256=KY7B97tYIb_wKTJbxyGaigiRct2VjY2H9hmNIfazKv4 774
opuscleaner/filters/strip_suffix.py sha256=qHLn344hsO4OLbga_86d77Yam71T9CaQ_u5muGQrMX8 2271
opuscleaner/filters/test_num_mismatch.py sha256=6bU03Wf3HBMz7q2y1EhyRoENZ8A9Q1le_CguEFgu5rQ 2186
opuscleaner/filters/traditionalise_chinese.json sha256=uTNwESPeljvtSB1jRrqGIo4BvyVbP-A_vErNpU_E0rM 152
opuscleaner/filters/opusfilter/AlphabetRatioFilter.json sha256=hZTweQFPfRltkuT9q568bUYLq_JwVKwl5n0G--35ffI 469
opuscleaner/filters/opusfilter/AverageWordLengthFilter.json sha256=z3cnYV_vg-3LEBFZfFW5tjQg8N_UOMKrBAeZB_sNvXw 633
opuscleaner/filters/opusfilter/CharacterScoreFilter.json sha256=rvrDYrS8y0KUg787-XzJQBS0L6BoQSjN6vYJwd5f2Do 4296
opuscleaner/filters/opusfilter/Detokenizer.json sha256=ahxhKkLh872Zk7_Pc_LXuMfe4OstmqqWSGsx1oZMfhg 1127
opuscleaner/filters/opusfilter/HtmlTagFilter.json sha256=e8qbYc4TDOUI1j3nq5l3tV67OZDdTf1OOCDje5ZMxsk 228
opuscleaner/filters/opusfilter/LengthFilter.json sha256=SevJVFdM53LFcvZJPQyUv6gCfu9bDDtXFwEKWQqvuFo 673
opuscleaner/filters/opusfilter/LengthRatioFilter.json sha256=S-ngrz5860Gk9KZV1vL4amsoJEKjeaHxRl_awxRG00g 471
opuscleaner/filters/opusfilter/LongWordFilter.json sha256=TDh_jEZAJncLxnn3NRwhwOmTcfdrfkyZoFF-4ahuzbI 323
opuscleaner/filters/opusfilter/RegExpFilter.json sha256=2ifcQteGJ310kqZ3MrXggCd65uY6sSOOE8MZR94S_ec 1049
opuscleaner/filters/opusfilter/RegExpSub.json sha256=53N3qGVQ4vfNRz0XoNpTN5p2zlN6ej22Se18SAKTt2E 1039
opuscleaner/filters/opusfilter/Tokenizer.json sha256=uZwMASJHWS6mI0coZozIpiSz0nEWY4qQuFxQqPj0_DM 1121
opuscleaner/filters/opusfilter/WhitespaceNormalizer.json sha256=bx7knXB0j95ezZ-EU370D8Jo2g7rOuGIYzm18hc3gjA 375
opuscleaner/filters/opusfilter/opusfilter-ersatz.py sha256=SbbhREz6iPInezvHquC-PzwmA4pVDXt_JQKuwjwXN8I 1808
opuscleaner/frontend/index.html sha256=aTQS1w2MRMyOW58pdR7zE0QQxto7E1Km7Q6BlLGwdKY 402
opuscleaner/frontend/assets/AddDatasetView.225be141.js sha256=GN9yH49g7bFIafMZGFY6H4kZD8kceSsv_BV42IsdJLA 8937
opuscleaner/frontend/assets/AddDatasetView.2e3a667a.css sha256=LjpmetBcZP7hYSCjby6_cek_XCqQDhQoLaDZX3KfIxY 3663
opuscleaner/frontend/assets/EditFiltersView.5c98ba92.js sha256=6KW-y7Pvgwxy9eUc5yRIIlcU1t6B7hI41KSTADqcm2s 121718
opuscleaner/frontend/assets/EditFiltersView.a241f96f.css sha256=okH5b4ZNFEWedWsWS6J13n-jCv7401ppAN358a3aYss 8179
opuscleaner/frontend/assets/EditFiltersYamlView.b2069b0f.js sha256=oVpwHA2HtxchaSE3K11K6R5EVbOCQtmFKp_W0mnxXEQ 444
opuscleaner/frontend/assets/ListDatasetsView.08ea4530.css sha256=COpFMNGWKt-aLnmQQBVv_wFTiutaOGMR_wHZ5lFPI3w 1220
opuscleaner/frontend/assets/ListDatasetsView.1fa8feb9.js sha256=EMxsgZIrrZ2Hd8weoJyzZuqHtAiTEY56i845kSOsi7g 2330
opuscleaner/frontend/assets/TagsEditor.1c7a5eaf.js sha256=yhVFCzKzmjkbeaBaMISuo3DvOH2CGNOPgMqTLKxFKkU 6270
opuscleaner/frontend/assets/TagsEditor.ccc03a15.css sha256=zMA6FSeOsxjbHkqc4EuHjICX3hCc3G_imTHiI-rZVRI 996
opuscleaner/frontend/assets/data-cuate.84693c76.svg sha256=hGk8dsFT9ol4qwhR8QA-I_HbCSpVlujM2BBhih7dP7k 38943
opuscleaner/frontend/assets/datailor-logo.fbfa6008.svg sha256=-_pgCN60AlJI95UA8O3hCMuJ8CqBRNOvqjtfN1F_3zI 5490
opuscleaner/frontend/assets/eu.24cff2c1.png sha256=JM_yweLbcVsFzdYhtwJQkB6EH4afWdsB6bswGiLeHzk 22166
opuscleaner/frontend/assets/hacks.9bee8b8e.js sha256=gdws0rT4qp9D0DMfiztYIC4h5N8Vd8qZ-ssUaWnxjFY 298
opuscleaner/frontend/assets/horizon-europe.80625b0c.png sha256=gGJbDBNHpwv59Y7YqZkxFsgifPeBK7BQYwJ7UuCX80U 101750
opuscleaner/frontend/assets/index.6cae667b.css sha256=bK5me3hoFvjcCvZZeNyq-GbWpQnn4lmKpj8hmpcs5Vg 2148
opuscleaner/frontend/assets/index.bf830d1b.js sha256=XIvWNDCO4rtO8ERVt09-CdWCNGB6dXIdo68ZSd4T7Xs 129136
opuscleaner/frontend/assets/vue-select.a6a4a76b.js sha256=UDl-MPxPHQrVCDbsJgnPSWwqoCsCAG-5HbQn0W9AuWQ 17440
opuscleaner/frontend/assets/vue-select.b0fac2a1.css sha256=sPrCoW4W5qxSFUFyi24DqF4F_2ivs71AiB06KZJ6uCY 7480
opuscleaner-0.4.4.dist-info/METADATA sha256=JdYbnUHZZGVRC7hDVYcU7qfWgm8pVvAqR7w6lHqA1VI 6885
opuscleaner-0.4.4.dist-info/WHEEL sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ 87
opuscleaner-0.4.4.dist-info/entry_points.txt sha256=-MNJFGFJr1YH6awIxFveoXdJAZedx-2TmBXnmRCg8oU 339
opuscleaner-0.4.4.dist-info/RECORD

entry_points.txt

opuscleaner-clean = opuscleaner.clean:main
opuscleaner-col = opuscleaner.col:main
opuscleaner-datasets = opuscleaner.datasets:main
opuscleaner-download = opuscleaner.download:main
opuscleaner-sample = opuscleaner.sample:main
opuscleaner-server = opuscleaner.server:main
opuscleaner-threshold = opuscleaner.threshold:main