Implemented extraction of the word list from the OOo thesaurus

main
Inga 🏳‍🌈 10 months ago
parent 599dcc2a0c
commit 2f059aa3f0
  1. 90044
      build-resources/th-en-x-basic.dat
  2. 13
      package-lock.json
  3. 2
      package.json
  4. 11
      src/build-scripts/oooThesaurusParser.test.ts
  5. 41
      src/build-scripts/oooThesaurusParser.ts

File diff suppressed because it is too large Load Diff

13
package-lock.json generated

@ -15,6 +15,7 @@
"@babel/plugin-syntax-jsx": "^7.23.3", "@babel/plugin-syntax-jsx": "^7.23.3",
"@preact/preset-vite": "^2.5.0", "@preact/preset-vite": "^2.5.0",
"@tsconfig/strictest": "^2.0.2", "@tsconfig/strictest": "^2.0.2",
"@types/lodash": "^4.14.202",
"@typescript-eslint/eslint-plugin": "^6.11.0", "@typescript-eslint/eslint-plugin": "^6.11.0",
"@typescript-eslint/parser": "^6.11.0", "@typescript-eslint/parser": "^6.11.0",
"eslint": "^8.53.0", "eslint": "^8.53.0",
@ -23,6 +24,7 @@
"eslint-plugin-prettier": "^5.0.1", "eslint-plugin-prettier": "^5.0.1",
"eslint-plugin-react": "^7.33.2", "eslint-plugin-react": "^7.33.2",
"eslint-plugin-react-hooks": "^4.6.0", "eslint-plugin-react-hooks": "^4.6.0",
"lodash": "^4.17.21",
"serve": "^14.2.1", "serve": "^14.2.1",
"tap": "^18.6.1", "tap": "^18.6.1",
"typescript": "^5.2.2", "typescript": "^5.2.2",
@ -2031,6 +2033,12 @@
"dev": true, "dev": true,
"license": "MIT" "license": "MIT"
}, },
"node_modules/@types/lodash": {
"version": "4.14.202",
"resolved": "https://registry.npmjs.org/@types/lodash/-/lodash-4.14.202.tgz",
"integrity": "sha512-OvlIYQK9tNneDlS0VN54LLd5uiPCBOp7gS5Z0f1mjoJYBrtStzgmJBxONW3U6OZqdtNzZPmn9BS/7WI7BFFcFQ==",
"dev": true
},
"node_modules/@types/node": { "node_modules/@types/node": {
"version": "20.9.1", "version": "20.9.1",
"dev": true, "dev": true,
@ -5719,8 +5727,9 @@
}, },
"node_modules/lodash": { "node_modules/lodash": {
"version": "4.17.21", "version": "4.17.21",
"dev": true, "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz",
"license": "MIT" "integrity": "sha512-v2kDEe57lecTulaDIuNTPy3Ry4gLGJ6Z1O3vE1krgXZNrsQ+LFTGHVxVjcXPs17LhbZVGedAJv8XZ1tvj5FvSg==",
"dev": true
}, },
"node_modules/lodash.memoize": { "node_modules/lodash.memoize": {
"version": "4.1.2", "version": "4.1.2",

@ -22,6 +22,7 @@
"@babel/plugin-syntax-jsx": "^7.23.3", "@babel/plugin-syntax-jsx": "^7.23.3",
"@preact/preset-vite": "^2.5.0", "@preact/preset-vite": "^2.5.0",
"@tsconfig/strictest": "^2.0.2", "@tsconfig/strictest": "^2.0.2",
"@types/lodash": "^4.14.202",
"@typescript-eslint/eslint-plugin": "^6.11.0", "@typescript-eslint/eslint-plugin": "^6.11.0",
"@typescript-eslint/parser": "^6.11.0", "@typescript-eslint/parser": "^6.11.0",
"eslint": "^8.53.0", "eslint": "^8.53.0",
@ -30,6 +31,7 @@
"eslint-plugin-prettier": "^5.0.1", "eslint-plugin-prettier": "^5.0.1",
"eslint-plugin-react": "^7.33.2", "eslint-plugin-react": "^7.33.2",
"eslint-plugin-react-hooks": "^4.6.0", "eslint-plugin-react-hooks": "^4.6.0",
"lodash": "^4.17.21",
"serve": "^14.2.1", "serve": "^14.2.1",
"tap": "^18.6.1", "tap": "^18.6.1",
"typescript": "^5.2.2", "typescript": "^5.2.2",

@ -0,0 +1,11 @@
import t from 'tap';
import { extractWordsFromFile } from './oooThesaurusParser.js';
// Runs the parser against the thesaurus data file shipped in build-resources
// and checks the number of words it returns.
void t.test('extractWordsFromFile', async (t) => {
  const thesaurusUrl = new URL(
    '../../build-resources/th-en-x-basic.dat',
    import.meta.url,
  );
  const words = await extractWordsFromFile(thesaurusUrl);
  // NOTE(review): 12371 looks like a snapshot of the parser's output for this
  // data file — re-derive it whenever the parser or the data file changes.
  t.equal(words.length, 12371);
});

@ -0,0 +1,41 @@
import type { PathLike } from 'node:fs';
import fs from 'node:fs/promises';
import _ from 'lodash';
/**
 * Partitions `list` into consecutive groups, treating every element for which
 * `isSeparator` returns true as a delimiter.
 *
 * Separator elements are dropped from the output. Each separator opens a new
 * (initially empty) group, so adjacent or trailing separators yield empty
 * groups. Elements that appear before the first separator are collected into
 * an initial group of their own. An empty input yields an empty result.
 *
 * @param list - Elements to partition; the array is not mutated.
 * @param isSeparator - Predicate that identifies delimiter elements.
 * @returns The groups, in input order.
 */
const splitList = <T>(
  list: T[],
  isSeparator: (element: T) => boolean,
): T[][] => {
  const result: T[][] = [];
  // The group currently being filled; `undefined` until the first group is opened.
  let current: T[] | undefined;
  for (const element of list) {
    if (isSeparator(element)) {
      current = [];
      result.push(current);
      continue;
    }
    if (current === undefined) {
      // The list does not start with a separator: open the first group on
      // demand instead of dereferencing a non-existent last group.
      current = [];
      result.push(current);
    }
    current.push(element);
  }
  return result;
};
/**
 * Matches a whole string consisting solely of ASCII letters and apostrophes.
 *
 * Deliberately declared without the `g` flag: with it,
 * `RegExp.prototype.test` resumes from `lastIndex` after a successful match,
 * so reusing the regex on the next string makes the `^` anchor fail and the
 * word following every accepted word is wrongly rejected.
 */
const WORD_REGEX = /^[a-zA-Z']+$/;

/**
 * Extracts the candidate words from the text of a thesaurus file.
 *
 * The first line is skipped and the remaining lines are grouped into chunks
 * separated by blank (whitespace-only) lines. For every non-empty chunk, the
 * part of its first line that precedes the first `|` is taken as the
 * candidate word. Candidates that are empty or contain anything other than
 * ASCII letters and apostrophes are discarded.
 *
 * @param thesaurus - Full text of the thesaurus file.
 * @returns The accepted words in file order; duplicates are not removed.
 */
const extractWordsFromThesaurus = (thesaurus: string): string[] => {
  // NOTE(review): the first line is presumably a file header (e.g. an encoding
  // declaration) — confirm against the data file format.
  const lines = thesaurus.split('\n').slice(1);
  const chunks = splitList(lines, (line) => line.trim().length === 0);
  const headwords = chunks
    .filter((chunk) => chunk.length > 0)
    .map((chunk) => chunk[0]?.split('|')[0]);
  // `_.compact` drops undefined/empty candidates before the character check.
  return _.compact(headwords).filter((word) => WORD_REGEX.test(word));
};
/**
 * Reads a thesaurus file and returns the words extracted from its contents.
 *
 * @param filePath - Location of the thesaurus file to read.
 * @returns The words produced by `extractWordsFromThesaurus` for the file's
 *   text. The promise rejects if the file cannot be read.
 */
export const extractWordsFromFile = async (
  filePath: PathLike,
): Promise<string[]> => {
  // NOTE(review): the file is decoded as 'ascii'; confirm the data file holds
  // no non-ASCII bytes, since those would not survive this decoding intact.
  const contents = await fs.readFile(filePath, { encoding: 'ascii' });
  return extractWordsFromThesaurus(contents);
};
Loading…
Cancel
Save