{
  "_id": "6a1f31dbb401979e73427d8f",
  "Package": "tokenizers",
  "Type": "Package",
  "Title": "Fast, Consistent Tokenization of Natural Language Text",
  "Version": "0.3.1",
  "Date": "2024-03-27",
  "Description": "Convert natural language text into tokens. Includes\ntokenizers for shingled n-grams, skip n-grams, words, word\nstems, sentences, paragraphs, characters, shingled characters,\nlines, Penn Treebank, regular expressions, as well as functions\nfor counting characters, words, and sentences, and a function\nfor splitting longer texts into separate documents, each with\nthe same number of words.  The tokenizers have a consistent\ninterface, and the package is built on the 'stringi' and 'Rcpp'\npackages for fast yet correct tokenization in 'UTF-8'.",
  "License": "MIT + file LICENSE",
  "LazyData": "yes",
  "Authors@R": "c(person(\"Thomas\", \"Charlon\", role = c(\"aut\", \"cre\"),\nemail = \"charlon@protonmail.com\",\ncomment = c(ORCID = \"0000-0001-7497-0470\")),\nperson(\"Lincoln\", \"Mullen\", role = c(\"aut\"),\nemail = \"lincoln@lincolnmullen.com\",\ncomment = c(ORCID = \"0000-0001-5103-6917\")),\nperson(\"Os\", \"Keyes\", role = c(\"ctb\"),\nemail = \"ironholds@gmail.com\",\ncomment = c(ORCID = \"0000-0001-5196-609X\")),\nperson(\"Dmitriy\", \"Selivanov\", role = c(\"ctb\"),\nemail = \"selivanov.dmitriy@gmail.com\"),\nperson(\"Jeffrey\", \"Arnold\", role = c(\"ctb\"),\nemail = \"jeffrey.arnold@gmail.com\",\ncomment = c(ORCID = \"0000-0001-9953-3904\")),\nperson(\"Kenneth\", \"Benoit\", role = c(\"ctb\"),\nemail = \"kbenoit@lse.ac.uk\",\ncomment = c(ORCID = \"0000-0002-0797-564X\")))",
  "URL": "https://docs.ropensci.org/tokenizers/,\nhttps://github.com/ropensci/tokenizers",
  "BugReports": "https://github.com/ropensci/tokenizers/issues",
  "RoxygenNote": "7.3.1",
  "Encoding": "UTF-8",
  "VignetteBuilder": "knitr",
  "Config/pak/sysreqs": "libicu-dev",
  "Repository": "https://ropensci.r-universe.dev",
  "Date/Publication": "2024-03-27 09:33:34 UTC",
  "RemoteUrl": "https://github.com/ropensci/tokenizers",
  "RemoteRef": "master",
  "RemoteSha": "b80863d088d4b39695b602ca11e061ac34770ec7",
  "NeedsCompilation": "yes",
  "Packaged": {
    "Date": "2026-05-15 08:19:23 UTC",
    "User": "root"
  },
  "Author": "Thomas Charlon [aut, cre] (ORCID:\n<https://orcid.org/0000-0001-7497-0470>),\nLincoln Mullen [aut] (ORCID: <https://orcid.org/0000-0001-5103-6917>),\nOs Keyes [ctb] (ORCID: <https://orcid.org/0000-0001-5196-609X>),\nDmitriy Selivanov [ctb],\nJeffrey Arnold [ctb] (ORCID: <https://orcid.org/0000-0001-9953-3904>),\nKenneth Benoit [ctb] (ORCID: <https://orcid.org/0000-0002-0797-564X>)",
  "Maintainer": "Thomas Charlon <charlon@protonmail.com>",
  "MD5sum": "1408a23c5cf855ab4bfcf8554f570e43",
  "_user": "ropensci",
  "_type": "src",
  "_file": "tokenizers_0.3.1.tar.gz",
  "_fileid": "a75228be5670407626ae1ca78e5729a4961e821c505678991c84dd5570beae4b",
  "_filesize": 576578,
  "_sha256": "a75228be5670407626ae1ca78e5729a4961e821c505678991c84dd5570beae4b",
  "_created": "2026-05-15T08:19:23.000Z",
  "_published": "2026-06-02T19:41:15.377Z",
  "_distro": "noble",
  "_jobs": [
    {
      "job": 79156871396,
      "time": 124,
      "config": "linux-devel-arm64",
      "r": "4.7.0",
      "check": "OK",
      "artifact": "7012972468"
    },
    {
      "job": 79156870896,
      "time": 172,
      "config": "linux-devel-x86_64",
      "r": "4.7.0",
      "check": "OK",
      "artifact": "7012983971"
    },
    {
      "job": 79156871599,
      "time": 123,
      "config": "linux-release-arm64",
      "r": "4.6.0",
      "check": "OK",
      "artifact": "7012972082"
    },
    {
      "job": 79156871555,
      "time": 136,
      "config": "linux-release-x86_64",
      "r": "4.6.0",
      "check": "OK",
      "artifact": "7012975512"
    },
    {
      "job": 79156871046,
      "time": 105,
      "config": "macos-oldrel-arm64",
      "r": "4.5.3",
      "check": "OK",
      "artifact": "7012967604"
    },
    {
      "job": 79156871036,
      "time": 179,
      "config": "macos-oldrel-x86_64",
      "r": "4.5.3",
      "check": "OK",
      "artifact": "7012985842"
    },
    {
      "job": 79156871261,
      "time": 93,
      "config": "macos-release-arm64",
      "r": "4.6.0",
      "check": "OK",
      "artifact": "7012965160"
    },
    {
      "job": 79156870880,
      "time": 221,
      "config": "macos-release-x86_64",
      "r": "4.6.0",
      "check": "OK",
      "artifact": "7012995828"
    },
    {
      "job": 79156870465,
      "time": 145,
      "config": "pkgdown",
      "r": "4.6.0",
      "check": "OK",
      "artifact": "7012932471"
    },
    {
      "job": 79156870306,
      "time": 195,
      "config": "source",
      "r": "4.6.0",
      "check": "OK",
      "artifact": "7012943105"
    },
    {
      "job": 79156870466,
      "time": 110,
      "config": "wasm-release",
      "r": "4.6.0",
      "check": "OK",
      "artifact": "7368024673"
    },
    {
      "job": 79156871127,
      "time": 110,
      "config": "windows-devel",
      "r": "4.7.0",
      "check": "OK",
      "artifact": "7012969017"
    },
    {
      "job": 79156871292,
      "time": 91,
      "config": "windows-oldrel",
      "r": "4.5.3",
      "check": "OK",
      "artifact": "7012964806"
    },
    {
      "job": 79156871385,
      "time": 98,
      "config": "windows-release",
      "r": "4.6.0",
      "check": "OK",
      "artifact": "7012966162"
    }
  ],
  "_buildurl": "https://github.com/r-universe/ropensci/actions/runs/25907692309",
  "_status": "success",
  "_host": "GitHub-Actions",
  "_upstream": "https://github.com/ropensci/tokenizers",
  "_commit": {
    "id": "b80863d088d4b39695b602ca11e061ac34770ec7",
    "author": "tcharlon <charlon@protonmail.com>",
    "committer": "tcharlon <charlon@protonmail.com>",
    "message": "new maintainer\n",
    "time": 1711532014
  },
  "_maintainer": {
    "name": "Thomas Charlon",
    "email": "charlon@protonmail.com",
    "login": "thomaschln",
    "linkedin": "in/thomas-charlon-meng-phd-aba0a3275",
    "orcid": "0000-0001-7497-0470",
    "uuid": 2394508
  },
  "_registered": true,
  "_dependencies": [
    {
      "package": "R",
      "version": ">= 3.1.3",
      "role": "Depends"
    },
    {
      "package": "Rcpp",
      "role": "LinkingTo"
    },
    {
      "package": "stringi",
      "version": ">= 1.0.1",
      "role": "Imports"
    },
    {
      "package": "Rcpp",
      "version": ">= 0.12.3",
      "role": "Imports"
    },
    {
      "package": "SnowballC",
      "version": ">= 0.5.1",
      "role": "Imports"
    },
    {
      "package": "covr",
      "role": "Suggests"
    },
    {
      "package": "knitr",
      "role": "Suggests"
    },
    {
      "package": "rmarkdown",
      "role": "Suggests"
    },
    {
      "package": "stopwords",
      "version": ">= 0.9.0",
      "role": "Suggests"
    },
    {
      "package": "testthat",
      "role": "Suggests"
    }
  ],
  "_owner": "ropensci",
  "_selfowned": true,
  "_usedby": 81,
  "_updates": [],
  "_tags": [],
  "_topics": [
    "nlp",
    "peer-reviewed",
    "text-mining",
    "tokenizer",
    "cpp"
  ],
  "_stars": 188,
  "_contributors": [
    {
      "user": "lmullen",
      "count": 175,
      "uuid": 183672
    },
    {
      "user": "dselivanov",
      "count": 6,
      "uuid": 5123805
    },
    {
      "user": "kbenoit",
      "count": 4,
      "uuid": 2182246
    },
    {
      "user": "jrnold",
      "count": 4,
      "uuid": 123968
    },
    {
      "user": "chrismuir",
      "count": 1,
      "uuid": 13386824
    },
    {
      "user": "emilhvitfeldt",
      "count": 1,
      "uuid": 14034784
    },
    {
      "user": "hideaki",
      "count": 1,
      "uuid": 19518
    },
    {
      "user": "jeroen",
      "count": 1,
      "uuid": 216319
    },
    {
      "user": "juliasilge",
      "count": 1,
      "uuid": 12505835
    },
    {
      "user": "karthik",
      "count": 1,
      "uuid": 138494
    },
    {
      "user": "maelle",
      "count": 1,
      "uuid": 8360597
    },
    {
      "user": "ironholds",
      "count": 1,
      "uuid": 2487262
    },
    {
      "user": "thomaschln",
      "count": 1,
      "uuid": 2394508
    }
  ],
  "_userbio": {
    "uuid": 1200269,
    "type": "organization",
    "name": "rOpenSci",
    "description": "Tools and R Packages for Open Science"
  },
  "_downloads": {
    "count": 46162,
    "source": "https://cranlogs.r-pkg.org/downloads/total/last-month/tokenizers"
  },
  "_mentions": 1,
  "_devurl": "https://github.com/ropensci/tokenizers",
  "_pkgdown": "https://docs.ropensci.org/tokenizers/",
  "_searchresults": 1294,
  "_metadata": {
    "review": {
      "id": 33,
      "status": "reviewed",
      "version": "0.1.1",
      "organization": "rOpenSci Software Review",
      "url": "https://github.com/ropensci/software-review/issues/33"
    },
    "ropensci_category": "scalereprod"
  },
  "_rbuild": "4.6.0",
  "_assets": [
    "extra/citation.cff",
    "extra/citation.html",
    "extra/citation.json",
    "extra/citation.txt",
    "extra/contents.json",
    "extra/NEWS.html",
    "extra/NEWS.txt",
    "extra/readme.html",
    "extra/readme.md",
    "extra/tokenizers.html",
    "manual.pdf"
  ],
  "_homeurl": "https://github.com/ropensci/tokenizers",
  "_realowner": "ropensci",
  "_cranurl": true,
  "_releases": [
    {
      "version": "0.1.0",
      "date": "2016-04-02"
    },
    {
      "version": "0.1.1",
      "date": "2016-04-04"
    },
    {
      "version": "0.1.2",
      "date": "2016-04-14"
    },
    {
      "version": "0.1.3",
      "date": "2016-08-18"
    },
    {
      "version": "0.1.4",
      "date": "2016-08-29"
    },
    {
      "version": "0.2.0",
      "date": "2018-03-21"
    },
    {
      "version": "0.2.1",
      "date": "2018-03-29"
    },
    {
      "version": "0.2.3",
      "date": "2022-09-23"
    },
    {
      "version": "0.3.0",
      "date": "2022-12-22"
    }
  ],
  "_exports": [
    "chunk_text",
    "count_characters",
    "count_sentences",
    "count_words",
    "tokenize_character_shingles",
    "tokenize_characters",
    "tokenize_lines",
    "tokenize_ngrams",
    "tokenize_paragraphs",
    "tokenize_ptb",
    "tokenize_regex",
    "tokenize_sentences",
    "tokenize_skip_ngrams",
    "tokenize_word_stems",
    "tokenize_words"
  ],
  "_datasets": [
    {
      "name": "mobydick",
      "title": "The text of Moby Dick",
      "object": "mobydick",
      "class": [
        "character"
      ],
      "fields": [],
      "table": false,
      "tojson": true
    }
  ],
  "_help": [
    {
      "page": "basic-tokenizers",
      "title": "Basic tokenizers",
      "topics": [
        "basic-tokenizers",
        "tokenize_characters",
        "tokenize_lines",
        "tokenize_paragraphs",
        "tokenize_regex",
        "tokenize_sentences",
        "tokenize_words"
      ]
    },
    {
      "page": "chunk_text",
      "title": "Chunk text into smaller segments",
      "topics": [
        "chunk_text"
      ]
    },
    {
      "page": "word-counting",
      "title": "Count words, sentences, characters",
      "topics": [
        "count_characters",
        "count_sentences",
        "count_words"
      ]
    },
    {
      "page": "mobydick",
      "title": "The text of Moby Dick",
      "topics": [
        "mobydick"
      ]
    },
    {
      "page": "ngram-tokenizers",
      "title": "N-gram tokenizers",
      "topics": [
        "ngram-tokenizers",
        "tokenize_ngrams",
        "tokenize_skip_ngrams"
      ]
    },
    {
      "page": "shingle-tokenizers",
      "title": "Character shingle tokenizers",
      "topics": [
        "tokenize_character_shingles"
      ]
    },
    {
      "page": "ptb-tokenizer",
      "title": "Penn Treebank Tokenizer",
      "topics": [
        "tokenize_ptb"
      ]
    },
    {
      "page": "stem-tokenizers",
      "title": "Word stem tokenizer",
      "topics": [
        "tokenize_word_stems"
      ]
    },
    {
      "page": "tokenizers",
      "title": "Tokenizers",
      "topics": [
        "tokenizers-package",
        "tokenizers"
      ]
    }
  ],
  "_readme": "https://github.com/ropensci/tokenizers/raw/master/README.md",
  "_rundeps": [
    "Rcpp",
    "SnowballC",
    "stringi"
  ],
  "_sysdeps": [
    {
      "shlib": "libstdc++",
      "package": "libstdc++6",
      "source": "gcc",
      "version": "14.2.0-4ubuntu2~24.04.1",
      "name": "c++",
      "homepage": "http://gcc.gnu.org/",
      "description": "GNU Standard C++ Library v3"
    }
  ],
  "_vignettes": [
    {
      "source": "introduction-to-tokenizers.Rmd",
      "filename": "introduction-to-tokenizers.html",
      "title": "Introduction to the tokenizers Package",
      "author": "Lincoln Mullen",
      "engine": "knitr::rmarkdown",
      "headings": [
        "Package overview",
        "Character and character-shingle tokenizers",
        "Word and word-stem tokenizers",
        "N-gram and skip n-gram tokenizers",
        "Sentence and paragraph tokenizers",
        "Text chunking",
        "Counting words, characters, sentences"
      ],
      "created": "2016-08-11 20:12:37",
      "modified": "2022-12-19 21:14:10",
      "commits": 12
    },
    {
      "source": "tif-and-tokenizers.Rmd",
      "filename": "tif-and-tokenizers.html",
      "title": "The Text Interchange Formats and the tokenizers Package",
      "author": "Lincoln Mullen",
      "engine": "knitr::rmarkdown",
      "headings": [],
      "created": "2018-03-14 00:10:35",
      "modified": "2022-09-23 18:07:51",
      "commits": 5
    }
  ],
  "_score": 13.515164261900965,
  "_indexed": true,
  "_nocasepkg": "tokenizers",
  "_universes": [
    "ropensci",
    "thomaschln"
  ],
  "_binaries": [
    {
      "r": "4.7.0",
      "os": "linux",
      "version": "0.3.1",
      "date": "2026-05-15T08:21:34.000Z",
      "distro": "noble",
      "arch": "aarch64",
      "commit": "b80863d088d4b39695b602ca11e061ac34770ec7",
      "fileid": "a22469caaa17a7f0860e2f2d4f01580d4ff4eb4e749299fe68616b7faf01523c",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/ropensci/actions/runs/25907692309"
    },
    {
      "r": "4.7.0",
      "os": "linux",
      "version": "0.3.1",
      "date": "2026-05-15T08:22:18.000Z",
      "distro": "noble",
      "arch": "x86_64",
      "commit": "b80863d088d4b39695b602ca11e061ac34770ec7",
      "fileid": "3216e225e576f4b452398abf22d40008e5dd033d8c732f06ea83607450590aa1",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/ropensci/actions/runs/25907692309"
    },
    {
      "r": "4.6.0",
      "os": "linux",
      "version": "0.3.1",
      "date": "2026-05-15T08:21:34.000Z",
      "distro": "noble",
      "arch": "aarch64",
      "commit": "b80863d088d4b39695b602ca11e061ac34770ec7",
      "fileid": "1942ad9dca025d39e4b95ab1471d602ab1c9166d0803fd376a7120824b0b2f75",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/ropensci/actions/runs/25907692309"
    },
    {
      "r": "4.6.0",
      "os": "linux",
      "version": "0.3.1",
      "date": "2026-05-15T08:21:43.000Z",
      "distro": "noble",
      "arch": "x86_64",
      "commit": "b80863d088d4b39695b602ca11e061ac34770ec7",
      "fileid": "a6e88ecdb148950201c64f0e3633362d8eb6e41c20191c58cf82ec8db0ac754a",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/ropensci/actions/runs/25907692309"
    },
    {
      "r": "4.5.3",
      "os": "mac",
      "version": "0.3.1",
      "date": "2026-05-15T08:21:15.000Z",
      "arch": "aarch64",
      "commit": "b80863d088d4b39695b602ca11e061ac34770ec7",
      "fileid": "df761e018eb1c795e61db761f712bc13adbae8e59985ccc63d956822e3c8373b",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/ropensci/actions/runs/25907692309"
    },
    {
      "r": "4.5.3",
      "os": "mac",
      "version": "0.3.1",
      "date": "2026-05-15T08:22:07.000Z",
      "arch": "x86_64",
      "commit": "b80863d088d4b39695b602ca11e061ac34770ec7",
      "fileid": "21ccd4547007a48b85ebc97ca425796c7c6e4b18deba5bc99ee68fe0e160e1af",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/ropensci/actions/runs/25907692309"
    },
    {
      "r": "4.6.0",
      "os": "mac",
      "version": "0.3.1",
      "date": "2026-05-15T08:21:04.000Z",
      "arch": "aarch64",
      "commit": "b80863d088d4b39695b602ca11e061ac34770ec7",
      "fileid": "650b5d43cac0c982dd11e1150cd3d4fe313cae1cb5f17c6ddb7e47d7929585bc",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/ropensci/actions/runs/25907692309"
    },
    {
      "r": "4.6.0",
      "os": "mac",
      "version": "0.3.1",
      "date": "2026-05-15T08:22:24.000Z",
      "arch": "x86_64",
      "commit": "b80863d088d4b39695b602ca11e061ac34770ec7",
      "fileid": "5e60494573cfd2be331a7ac508a18879504263016edcbcd6d90be2e407370703",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/ropensci/actions/runs/25907692309"
    },
    {
      "r": "4.7.0",
      "os": "win",
      "version": "0.3.1",
      "date": "2026-05-15T08:20:55.000Z",
      "arch": "x86_64",
      "commit": "b80863d088d4b39695b602ca11e061ac34770ec7",
      "fileid": "b90ab81f25fefc484b12f5d7a65d00bbc3383ac93f1d6958f8fc6f21d3e1cd00",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/ropensci/actions/runs/25907692309"
    },
    {
      "r": "4.5.3",
      "os": "win",
      "version": "0.3.1",
      "date": "2026-05-15T08:20:41.000Z",
      "arch": "x86_64",
      "commit": "b80863d088d4b39695b602ca11e061ac34770ec7",
      "fileid": "1e4ee2d2342604a1db25369261d412aaec01083d55cebbc2f398f94e52bec8d6",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/ropensci/actions/runs/25907692309"
    },
    {
      "r": "4.6.0",
      "os": "win",
      "version": "0.3.1",
      "date": "2026-05-15T08:20:44.000Z",
      "arch": "x86_64",
      "commit": "b80863d088d4b39695b602ca11e061ac34770ec7",
      "fileid": "8b4e4b6122597e89d2d04ef8e97416d41a92ece4b3a2a26a4321dd579477d173",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/ropensci/actions/runs/25907692309"
    },
    {
      "r": "4.6.0",
      "os": "wasm",
      "version": "0.3.1",
      "date": "2026-06-02T19:40:40.000Z",
      "arch": "emscripten",
      "commit": "b80863d088d4b39695b602ca11e061ac34770ec7",
      "fileid": "c662095a9eb234ccbee87116650385b6c9ac84e0d5fc937ea52fd86c2e9a4e11",
      "status": "success",
      "buildurl": "https://github.com/r-universe/ropensci/actions/runs/25907692309"
    }
  ]
}