{
  "_id": "6a44ce176ff6f2357551b9d6",
  "Package": "tokenizers",
  "Type": "Package",
  "Title": "Fast, Consistent Tokenization of Natural Language Text",
  "Version": "0.3.1",
  "Date": "2024-03-27",
  "Description": "Convert natural language text into tokens. Includes\ntokenizers for shingled n-grams, skip n-grams, words, word\nstems, sentences, paragraphs, characters, shingled characters,\nlines, Penn Treebank, regular expressions, as well as functions\nfor counting characters, words, and sentences, and a function\nfor splitting longer texts into separate documents, each with\nthe same number of words.  The tokenizers have a consistent\ninterface, and the package is built on the 'stringi' and 'Rcpp'\npackages for fast yet correct tokenization in 'UTF-8'.",
  "License": "MIT + file LICENSE",
  "LazyData": "yes",
  "Authors@R": "c(person(\"Thomas\", \"Charlon\", role = c(\"aut\", \"cre\"),\nemail = \"charlon@protonmail.com\",\ncomment = c(ORCID = \"0000-0001-7497-0470\")),\nperson(\"Lincoln\", \"Mullen\", role = c(\"aut\"),\nemail = \"lincoln@lincolnmullen.com\",\ncomment = c(ORCID = \"0000-0001-5103-6917\")),\nperson(\"Os\", \"Keyes\", role = c(\"ctb\"),\nemail = \"ironholds@gmail.com\",\ncomment = c(ORCID = \"0000-0001-5196-609X\")),\nperson(\"Dmitriy\", \"Selivanov\", role = c(\"ctb\"),\nemail = \"selivanov.dmitriy@gmail.com\"),\nperson(\"Jeffrey\", \"Arnold\", role = c(\"ctb\"),\nemail = \"jeffrey.arnold@gmail.com\",\ncomment = c(ORCID = \"0000-0001-9953-3904\")),\nperson(\"Kenneth\", \"Benoit\", role = c(\"ctb\"),\nemail = \"kbenoit@lse.ac.uk\",\ncomment = c(ORCID = \"0000-0002-0797-564X\")))",
  "URL": "https://docs.ropensci.org/tokenizers/,\nhttps://github.com/ropensci/tokenizers",
  "BugReports": "https://github.com/ropensci/tokenizers/issues",
  "RoxygenNote": "7.3.1",
  "Encoding": "UTF-8",
  "VignetteBuilder": "knitr",
  "Config/pak/sysreqs": "libicu-dev",
  "Repository": "https://ropensci.r-universe.dev",
  "Date/Publication": "2024-03-27 09:33:34 UTC",
  "RemoteUrl": "https://github.com/ropensci/tokenizers",
  "RemoteRef": "master",
  "RemoteSha": "b80863d088d4b39695b602ca11e061ac34770ec7",
  "NeedsCompilation": "yes",
  "Packaged": {
    "Date": "2026-07-01 08:14:52 UTC",
    "User": "root"
  },
  "Author": "Thomas Charlon [aut, cre] (ORCID:\n<https://orcid.org/0000-0001-7497-0470>),\nLincoln Mullen [aut] (ORCID: <https://orcid.org/0000-0001-5103-6917>),\nOs Keyes [ctb] (ORCID: <https://orcid.org/0000-0001-5196-609X>),\nDmitriy Selivanov [ctb],\nJeffrey Arnold [ctb] (ORCID: <https://orcid.org/0000-0001-9953-3904>),\nKenneth Benoit [ctb] (ORCID: <https://orcid.org/0000-0002-0797-564X>)",
  "Maintainer": "Thomas Charlon <charlon@protonmail.com>",
  "_user": "ropensci",
  "_type": "src",
  "_file": "tokenizers_0.3.1.tar.gz",
  "_fileid": "https://r2.ropensci.org/fe320d7adc4535a8118f420bad63fb0a56fdfb656ecfbe04a0a76963c59a713d",
  "_filesize": 576611,
  "_sha256": "fe320d7adc4535a8118f420bad63fb0a56fdfb656ecfbe04a0a76963c59a713d",
  "_expires": "2026-10-09T08:21:42.000Z",
  "_created": "2026-07-01T08:14:52.000Z",
  "_published": "2026-07-01T08:21:43.109Z",
  "_jobs": [
    {
      "job": 84486188289,
      "time": 157,
      "config": "linux-devel-arm64",
      "r": "4.7.0",
      "check": "OK",
      "artifact": "8004059524"
    },
    {
      "job": 84486188310,
      "time": 150,
      "config": "linux-devel-x86_64",
      "r": "4.7.0",
      "check": "OK",
      "artifact": "8004055857"
    },
    {
      "job": 84486188277,
      "time": 152,
      "config": "linux-release-arm64",
      "r": "4.6.1",
      "check": "OK",
      "artifact": "8004058281"
    },
    {
      "job": 84486188312,
      "time": 169,
      "config": "linux-release-x86_64",
      "r": "4.6.1",
      "check": "OK",
      "artifact": "8004063181"
    },
    {
      "job": 84486188358,
      "time": 122,
      "config": "macos-oldrel-arm64",
      "r": "4.5.3",
      "check": "OK",
      "artifact": "8004064576"
    },
    {
      "job": 84486188232,
      "time": 186,
      "config": "macos-oldrel-x86_64",
      "r": "4.5.3",
      "check": "OK",
      "artifact": "8004082258"
    },
    {
      "job": 84486188333,
      "time": 118,
      "config": "macos-release-arm64",
      "r": "4.6.1",
      "check": "OK",
      "artifact": "8004062494"
    },
    {
      "job": 84486188226,
      "time": 227,
      "config": "macos-release-x86_64",
      "r": "4.6.1",
      "check": "OK",
      "artifact": "8004095615"
    },
    {
      "job": 84485541959,
      "time": 151,
      "config": "pkgdown",
      "r": "4.6.0",
      "check": "OK",
      "artifact": "8003978687"
    },
    {
      "job": 84485541952,
      "time": 213,
      "config": "source",
      "r": "4.6.1",
      "check": "OK",
      "artifact": "8004001021"
    },
    {
      "job": 84486188265,
      "time": 148,
      "config": "wasm-release",
      "r": "4.6.0",
      "check": "OK",
      "artifact": "8004055669"
    },
    {
      "job": 84486188235,
      "time": 114,
      "config": "windows-devel",
      "r": "4.7.0",
      "check": "OK",
      "artifact": "8004043162"
    },
    {
      "job": 84486188375,
      "time": 105,
      "config": "windows-oldrel",
      "r": "4.5.3",
      "check": "OK",
      "artifact": "8004039739"
    },
    {
      "job": 84486188302,
      "time": 104,
      "config": "windows-release",
      "r": "4.6.1",
      "check": "OK",
      "artifact": "8004040009"
    }
  ],
  "_host": "GitHub-Actions",
  "_buildurl": "https://github.com/r-universe/ropensci/actions/runs/28503290646",
  "_status": "success",
  "_upstream": "https://github.com/ropensci/tokenizers",
  "_commit": {
    "id": "b80863d088d4b39695b602ca11e061ac34770ec7",
    "author": "tcharlon <charlon@protonmail.com>",
    "committer": "tcharlon <charlon@protonmail.com>",
    "message": "new maintainer\n",
    "time": 1711532014
  },
  "_maintainer": {
    "name": "Thomas Charlon",
    "email": "charlon@protonmail.com",
    "login": "thomaschln",
    "linkedin": "in/thomas-charlon-meng-phd-aba0a3275",
    "orcid": "0000-0001-7497-0470",
    "description": "Harvard Medical School Researcher\nBiomedical Informatics @hms-dbmi\nCELEHS laboratory @celehs",
    "uuid": 2394508
  },
  "_distro": "resolute",
  "_registered": true,
  "_dependencies": [
    {
      "package": "R",
      "version": ">= 3.1.3",
      "role": "Depends"
    },
    {
      "package": "Rcpp",
      "role": "LinkingTo"
    },
    {
      "package": "stringi",
      "version": ">= 1.0.1",
      "role": "Imports"
    },
    {
      "package": "Rcpp",
      "version": ">= 0.12.3",
      "role": "Imports"
    },
    {
      "package": "SnowballC",
      "version": ">= 0.5.1",
      "role": "Imports"
    },
    {
      "package": "covr",
      "role": "Suggests"
    },
    {
      "package": "knitr",
      "role": "Suggests"
    },
    {
      "package": "rmarkdown",
      "role": "Suggests"
    },
    {
      "package": "stopwords",
      "version": ">= 0.9.0",
      "role": "Suggests"
    },
    {
      "package": "testthat",
      "role": "Suggests"
    }
  ],
  "_owner": "ropensci",
  "_selfowned": true,
  "_usedby": 81,
  "_updates": [],
  "_tags": [],
  "_topics": [
    "nlp",
    "peer-reviewed",
    "text-mining",
    "tokenizer",
    "cpp"
  ],
  "_stars": 188,
  "_contributors": [
    {
      "user": "lmullen",
      "count": 175,
      "uuid": 183672
    },
    {
      "user": "dselivanov",
      "count": 6,
      "uuid": 5123805
    },
    {
      "user": "kbenoit",
      "count": 4,
      "uuid": 2182246
    },
    {
      "user": "jrnold",
      "count": 4,
      "uuid": 123968
    },
    {
      "user": "chrismuir",
      "count": 1,
      "uuid": 13386824
    },
    {
      "user": "emilhvitfeldt",
      "count": 1,
      "uuid": 14034784
    },
    {
      "user": "hideaki",
      "count": 1,
      "uuid": 19518
    },
    {
      "user": "jeroen",
      "count": 1,
      "uuid": 216319
    },
    {
      "user": "juliasilge",
      "count": 1,
      "uuid": 12505835
    },
    {
      "user": "karthik",
      "count": 1,
      "uuid": 138494
    },
    {
      "user": "maelle",
      "count": 1,
      "uuid": 8360597
    },
    {
      "user": "ironholds",
      "count": 1,
      "uuid": 2487262
    },
    {
      "user": "thomaschln",
      "count": 1,
      "uuid": 2394508
    }
  ],
  "_userbio": {
    "uuid": 1200269,
    "type": "organization",
    "name": "rOpenSci",
    "followers": 1106,
    "description": "Tools and R Packages for Open Science"
  },
  "_downloads": {
    "count": 33523,
    "source": "https://cranlogs.r-pkg.org/downloads/total/last-month/tokenizers"
  },
  "_mentions": 1,
  "_devurl": "https://github.com/ropensci/tokenizers",
  "_pkgdown": "https://docs.ropensci.org/tokenizers/",
  "_searchresults": 1544,
  "_metadata": {
    "review": {
      "id": 33,
      "status": "reviewed",
      "version": "0.1.1",
      "organization": "rOpenSci Software Review",
      "url": "https://github.com/ropensci/software-review/issues/33"
    },
    "ropensci_category": "scalereprod"
  },
  "_rbuild": "4.6.1",
  "_assets": [
    "extra/citation.cff",
    "extra/citation.html",
    "extra/citation.json",
    "extra/citation.txt",
    "extra/contents.json",
    "extra/NEWS.html",
    "extra/NEWS.txt",
    "extra/readme.html",
    "extra/readme.md",
    "extra/tokenizers.html",
    "LICENSE",
    "manual.pdf"
  ],
  "_homeurl": "https://github.com/ropensci/tokenizers",
  "_realowner": "ropensci",
  "_cranurl": true,
  "_releases": [
    {
      "version": "0.1.0",
      "date": "2016-04-02"
    },
    {
      "version": "0.1.1",
      "date": "2016-04-04"
    },
    {
      "version": "0.1.2",
      "date": "2016-04-14"
    },
    {
      "version": "0.1.3",
      "date": "2016-08-18"
    },
    {
      "version": "0.1.4",
      "date": "2016-08-29"
    },
    {
      "version": "0.2.0",
      "date": "2018-03-21"
    },
    {
      "version": "0.2.1",
      "date": "2018-03-29"
    },
    {
      "version": "0.2.3",
      "date": "2022-09-23"
    },
    {
      "version": "0.3.0",
      "date": "2022-12-22"
    }
  ],
  "_exports": [
    "chunk_text",
    "count_characters",
    "count_sentences",
    "count_words",
    "tokenize_character_shingles",
    "tokenize_characters",
    "tokenize_lines",
    "tokenize_ngrams",
    "tokenize_paragraphs",
    "tokenize_ptb",
    "tokenize_regex",
    "tokenize_sentences",
    "tokenize_skip_ngrams",
    "tokenize_word_stems",
    "tokenize_words"
  ],
  "_datasets": [
    {
      "name": "mobydick",
      "title": "The text of Moby Dick",
      "object": "mobydick",
      "class": [
        "character"
      ],
      "fields": [],
      "table": false,
      "tojson": true
    }
  ],
  "_help": [
    {
      "page": "basic-tokenizers",
      "title": "Basic tokenizers",
      "topics": [
        "basic-tokenizers",
        "tokenize_characters",
        "tokenize_lines",
        "tokenize_paragraphs",
        "tokenize_regex",
        "tokenize_sentences",
        "tokenize_words"
      ]
    },
    {
      "page": "chunk_text",
      "title": "Chunk text into smaller segments",
      "topics": [
        "chunk_text"
      ]
    },
    {
      "page": "word-counting",
      "title": "Count words, sentences, characters",
      "topics": [
        "count_characters",
        "count_sentences",
        "count_words"
      ]
    },
    {
      "page": "mobydick",
      "title": "The text of Moby Dick",
      "topics": [
        "mobydick"
      ]
    },
    {
      "page": "ngram-tokenizers",
      "title": "N-gram tokenizers",
      "topics": [
        "ngram-tokenizers",
        "tokenize_ngrams",
        "tokenize_skip_ngrams"
      ]
    },
    {
      "page": "shingle-tokenizers",
      "title": "Character shingle tokenizers",
      "topics": [
        "tokenize_character_shingles"
      ]
    },
    {
      "page": "ptb-tokenizer",
      "title": "Penn Treebank Tokenizer",
      "topics": [
        "tokenize_ptb"
      ]
    },
    {
      "page": "stem-tokenizers",
      "title": "Word stem tokenizer",
      "topics": [
        "tokenize_word_stems"
      ]
    },
    {
      "page": "tokenizers",
      "title": "Tokenizers",
      "topics": [
        "tokenizers-package",
        "tokenizers"
      ]
    }
  ],
  "_readme": "https://github.com/ropensci/tokenizers/raw/master/README.md",
  "_rundeps": [
    "Rcpp",
    "SnowballC",
    "stringi"
  ],
  "_sysdeps": [
    {
      "shlib": "libstdc++",
      "package": "libstdc++6",
      "source": "gcc",
      "version": "16-20260322-1ubuntu1",
      "name": "c++",
      "homepage": "http://gcc.gnu.org/",
      "description": "GNU Standard C++ Library v3"
    }
  ],
  "_vignettes": [
    {
      "source": "introduction-to-tokenizers.Rmd",
      "filename": "introduction-to-tokenizers.html",
      "title": "Introduction to the tokenizers Package",
      "author": "Lincoln Mullen",
      "engine": "knitr::rmarkdown",
      "headings": [
        "Package overview",
        "Character and character-shingle tokenizers",
        "Word and word-stem tokenizers",
        "N-gram and skip n-gram tokenizers",
        "Sentence and paragraph tokenizers",
        "Text chunking",
        "Counting words, characters, sentences"
      ],
      "created": "2016-08-11 20:12:37",
      "modified": "2022-12-19 21:14:10",
      "commits": 12
    },
    {
      "source": "tif-and-tokenizers.Rmd",
      "filename": "tif-and-tokenizers.html",
      "title": "The Text Interchange Formats and the tokenizers Package",
      "author": "Lincoln Mullen",
      "engine": "knitr::rmarkdown",
      "headings": [],
      "created": "2018-03-14 00:10:35",
      "modified": "2022-09-23 18:07:51",
      "commits": 5
    }
  ],
  "_score": 13.452935541966198,
  "_indexed": true,
  "_nocasepkg": "tokenizers",
  "_universes": [
    "ropensci",
    "thomaschln"
  ],
  "_binaries": [
    {
      "r": "4.7.0",
      "os": "linux",
      "version": "0.3.1",
      "date": "2026-07-01T08:17:42.000Z",
      "distro": "resolute",
      "arch": "aarch64",
      "commit": "b80863d088d4b39695b602ca11e061ac34770ec7",
      "fileid": "https://r2.ropensci.org/ff24889e2bc206fed1eaab633646ad81f3f34619bddadb0accf0de4c9883f809",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/ropensci/actions/runs/28503290646"
    },
    {
      "r": "4.7.0",
      "os": "linux",
      "version": "0.3.1",
      "date": "2026-07-01T08:17:27.000Z",
      "distro": "resolute",
      "arch": "x86_64",
      "commit": "b80863d088d4b39695b602ca11e061ac34770ec7",
      "fileid": "https://r2.ropensci.org/90a951980d1b550db6d5767ab3369dc2c551e976d29b70d0c4c7d512af3f8729",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/ropensci/actions/runs/28503290646"
    },
    {
      "r": "4.6.1",
      "os": "linux",
      "version": "0.3.1",
      "date": "2026-07-01T08:17:37.000Z",
      "distro": "resolute",
      "arch": "aarch64",
      "commit": "b80863d088d4b39695b602ca11e061ac34770ec7",
      "fileid": "https://r2.ropensci.org/a8c769077f5b5c98790e9b74676c3274051977ec453094e69022b5df04b13d42",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/ropensci/actions/runs/28503290646"
    },
    {
      "r": "4.6.1",
      "os": "linux",
      "version": "0.3.1",
      "date": "2026-07-01T08:17:46.000Z",
      "distro": "resolute",
      "arch": "x86_64",
      "commit": "b80863d088d4b39695b602ca11e061ac34770ec7",
      "fileid": "https://r2.ropensci.org/fff97b3c1899f55218231f216987db20955ad2fafaefbfa65c618a1ef5a08daf",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/ropensci/actions/runs/28503290646"
    },
    {
      "r": "4.5.3",
      "os": "mac",
      "version": "0.3.1",
      "date": "2026-07-01T08:17:50.000Z",
      "arch": "aarch64",
      "commit": "b80863d088d4b39695b602ca11e061ac34770ec7",
      "fileid": "https://r2.ropensci.org/534a0835d1939a9cb31aeacbf6962f8eba2f5c2d62b36f211e8185f45ad72662",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/ropensci/actions/runs/28503290646"
    },
    {
      "r": "4.5.3",
      "os": "mac",
      "version": "0.3.1",
      "date": "2026-07-01T08:18:25.000Z",
      "arch": "x86_64",
      "commit": "b80863d088d4b39695b602ca11e061ac34770ec7",
      "fileid": "https://r2.ropensci.org/d84053eb0691be48d5aaa2337716248098dfaf65b9bd252ba79ec937b72666f6",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/ropensci/actions/runs/28503290646"
    },
    {
      "r": "4.6.1",
      "os": "mac",
      "version": "0.3.1",
      "date": "2026-07-01T08:17:45.000Z",
      "arch": "aarch64",
      "commit": "b80863d088d4b39695b602ca11e061ac34770ec7",
      "fileid": "https://r2.ropensci.org/1a4a0c9594c152487e54a7690881ae3162597882cd042000d067e8fccb451a7a",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/ropensci/actions/runs/28503290646"
    },
    {
      "r": "4.6.1",
      "os": "mac",
      "version": "0.3.1",
      "date": "2026-07-01T08:18:47.000Z",
      "arch": "x86_64",
      "commit": "b80863d088d4b39695b602ca11e061ac34770ec7",
      "fileid": "https://r2.ropensci.org/f92d8da2a623fd65d56bdb00d98f187d4e04459e0502d4dc791304477ad27e43",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/ropensci/actions/runs/28503290646"
    },
    {
      "r": "4.6.0",
      "os": "wasm",
      "version": "0.3.1",
      "date": "2026-07-01T08:17:43.000Z",
      "arch": "emscripten",
      "commit": "b80863d088d4b39695b602ca11e061ac34770ec7",
      "fileid": "https://r2.ropensci.org/223c64287f2a09f0359671d6e3eb7e92989a6aa0b41c3ddfe0dcee28b59c0a43",
      "status": "success",
      "buildurl": "https://github.com/r-universe/ropensci/actions/runs/28503290646"
    },
    {
      "r": "4.7.0",
      "os": "win",
      "version": "0.3.1",
      "date": "2026-07-01T08:16:25.000Z",
      "arch": "x86_64",
      "commit": "b80863d088d4b39695b602ca11e061ac34770ec7",
      "fileid": "https://r2.ropensci.org/8e0fac26c43661d1bd3cb13c03163193852953d6df36f2077fa762ff8afcdd15",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/ropensci/actions/runs/28503290646"
    },
    {
      "r": "4.5.3",
      "os": "win",
      "version": "0.3.1",
      "date": "2026-07-01T08:16:19.000Z",
      "arch": "x86_64",
      "commit": "b80863d088d4b39695b602ca11e061ac34770ec7",
      "fileid": "https://r2.ropensci.org/bca421a98d3da8b5f37f167b858233237fe257636ee14d8670ad6376841c8cfd",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/ropensci/actions/runs/28503290646"
    },
    {
      "r": "4.6.1",
      "os": "win",
      "version": "0.3.1",
      "date": "2026-07-01T08:16:19.000Z",
      "arch": "x86_64",
      "commit": "b80863d088d4b39695b602ca11e061ac34770ec7",
      "fileid": "https://r2.ropensci.org/a7becc8f8ce987a643bae2dbd8e3e23ba7f0160898acdd4fca889b27652ed9af",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/ropensci/actions/runs/28503290646"
    }
  ]
}