parent
599dcc2a0c
commit
2f059aa3f0
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,11 @@ |
|||||||
|
import t from 'tap'; |
||||||
|
|
||||||
|
import { extractWordsFromFile } from './oooThesaurusParser.js'; |
||||||
|
|
||||||
|
// Checks that parsing the bundled thesaurus data file yields the expected
// number of words.
void t.test('extractWordsFromFile', async (test) => {
  const thesaurusUrl = new URL(
    '../../build-resources/th-en-x-basic.dat',
    import.meta.url,
  );
  const extracted = await extractWordsFromFile(thesaurusUrl);

  test.equal(extracted.length, 12371);
});
@ -0,0 +1,41 @@ |
|||||||
|
import type { PathLike } from 'node:fs'; |
||||||
|
import fs from 'node:fs/promises'; |
||||||
|
import _ from 'lodash'; |
||||||
|
|
||||||
|
/**
 * Splits `list` into consecutive chunks, using the elements for which
 * `isSeparator` returns `true` as boundaries. Separator elements themselves
 * are not included in the output.
 *
 * Every separator opens a new (possibly empty) chunk. Elements that appear
 * before the first separator are collected into a leading chunk of their own.
 *
 * @param list - The elements to split; it is not mutated.
 * @param isSeparator - Predicate identifying boundary elements.
 * @returns The chunks, in order of appearance. An empty `list` yields `[]`.
 */
const splitList = <T>(
  list: readonly T[],
  isSeparator: (element: T) => boolean,
): T[][] => {
  const result: T[][] = [];
  // The chunk currently being filled; `undefined` until the first element.
  let current: T[] | undefined;

  for (const element of list) {
    if (isSeparator(element)) {
      current = [];
      result.push(current);
      continue;
    }

    // The list may start with a non-separator, in which case no chunk has
    // been opened yet. Open one lazily instead of assuming it exists.
    if (current === undefined) {
      current = [];
      result.push(current);
    }
    current.push(element);
  }

  return result;
};
||||||
|
|
||||||
|
// Matches a string made up entirely of ASCII letters and apostrophes.
//
// Deliberately has no `g` flag: a global regex keeps `lastIndex` between
// `test()` calls, so reusing it across many strings makes a match on one
// string cause a spurious miss on the next. The `i` flag is unnecessary
// because the character class already lists both cases.
const WORD_REGEX = /^[a-zA-Z']+$/;
||||||
|
|
||||||
|
/**
 * Pulls the candidate words out of thesaurus text.
 *
 * The first line is discarded. The remaining lines are grouped into chunks
 * separated by blank (whitespace-only) lines, and the text before the first
 * `|` on each non-empty chunk's first line is taken as a candidate. Empty
 * candidates are dropped, and the rest are kept only if `WORD_REGEX` accepts
 * them.
 *
 * @param thesaurus - Full text of the thesaurus, with `\n` line breaks.
 * @returns The accepted words, in the order they appear.
 */
const extractWordsFromThesaurus = (thesaurus: string): string[] => {
  const [, ...body] = thesaurus.split('\n');
  const isBlank = (line: string): boolean => line.trim().length === 0;

  const entries = splitList(body, isBlank).filter((entry) => entry.length > 0);
  const candidates = entries.map((entry) => entry[0]?.split('|')[0]);

  return _.compact(candidates).filter((word) => WORD_REGEX.test(word));
};
||||||
|
|
||||||
|
/**
 * Reads the file at `filePath` and returns the words extracted from its
 * contents by `extractWordsFromThesaurus`.
 *
 * The file is decoded with Node's `'ascii'` encoding.
 *
 * @param filePath - Location of the thesaurus file.
 * @returns A promise resolving to the extracted words; it rejects if the
 *   file cannot be read.
 */
export const extractWordsFromFile = async (
  filePath: PathLike,
): Promise<string[]> => {
  const contents = await fs.readFile(filePath, { encoding: 'ascii' });
  return extractWordsFromThesaurus(contents);
};
Loading…
Reference in new issue