parent
599dcc2a0c
commit
2f059aa3f0
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,11 @@ |
||||
import t from 'tap'; |
||||
|
||||
import { extractWordsFromFile } from './oooThesaurusParser.js'; |
||||
|
||||
// Smoke test: parse the thesaurus data file referenced below and check how
// many words the parser extracts from it.
void t.test('extractWordsFromFile', async (t) => {
  // Resolve the data file relative to this module rather than the CWD.
  const thesaurusUrl = new URL(
    '../../build-resources/th-en-x-basic.dat',
    import.meta.url,
  );
  const extracted = await extractWordsFromFile(thesaurusUrl);

  // NOTE(review): 12371 is presumably the count the parser yields for the
  // current data file — re-verify it whenever the data or the parser's
  // filtering rules change.
  t.equal(extracted.length, 12371);
});
@ -0,0 +1,41 @@ |
||||
import type { PathLike } from 'node:fs'; |
||||
import fs from 'node:fs/promises'; |
||||
import _ from 'lodash'; |
||||
|
||||
/**
 * Splits `list` into chunks, using the elements for which `isSeparator`
 * returns `true` as chunk boundaries. Separator elements themselves are not
 * included in the output.
 *
 * Every separator opens a new (initially empty) chunk, so consecutive or
 * trailing separators yield empty chunks. Elements that appear before the
 * first separator are collected into a leading chunk of their own.
 *
 * @param list - The elements to partition.
 * @param isSeparator - Predicate identifying boundary elements.
 * @returns The chunks, in input order.
 */
const splitList = <T>(
  list: T[],
  isSeparator: (element: T) => boolean,
): T[][] => {
  const result: T[][] = [];
  // The chunk currently being filled; undefined until the first element.
  let current: T[] | undefined;

  for (const element of list) {
    if (isSeparator(element)) {
      current = [];
      result.push(current);
      continue;
    }

    // The input may not start with a separator, in which case no chunk has
    // been opened yet. Open one lazily instead of assuming it exists.
    if (current === undefined) {
      current = [];
      result.push(current);
    }
    current.push(element);
  }

  return result;
};
||||
|
||||
/**
 * Matches a string made up entirely of ASCII letters and apostrophes.
 *
 * Intentionally declared WITHOUT the `g` flag: a global regex keeps state in
 * `lastIndex`, so reusing it across `.test()` calls makes the call following
 * every successful match fail (the `^` anchor cannot match at a non-zero
 * `lastIndex`). The `i` flag is omitted as well because the character class
 * already lists both cases.
 */
const WORD_REGEX = /^[a-zA-Z']+$/;
||||
|
||||
/**
 * Pulls the candidate words out of the text of a thesaurus file.
 *
 * The first line of the text is skipped. The remaining lines are grouped into
 * entries separated by blank (whitespace-only) lines; for each non-empty
 * entry the text before the first `|` on its first line is taken. Empty
 * results are dropped and the rest are filtered through `WORD_REGEX`.
 *
 * @param thesaurus - Full text content of the thesaurus file.
 * @returns The extracted words, in file order.
 */
const extractWordsFromThesaurus = (thesaurus: string): string[] => {
  // Drop the first line; only the lines after it are parsed.
  const [, ...entryLines] = thesaurus.split('\n');
  const entries = splitList(entryLines, (line) => line.trim().length === 0);

  const headwords: (string | undefined)[] = [];
  for (const entry of entries) {
    if (entry.length === 0) {
      continue;
    }
    // The headword is the first `|`-delimited field of the entry's first line.
    headwords.push(entry[0]?.split('|')[0]);
  }

  return _.compact(headwords).filter((headword) => WORD_REGEX.test(headword));
};
||||
|
||||
/**
 * Reads a thesaurus file from disk and returns the words extracted from it.
 *
 * @param filePath - Location of the thesaurus file.
 * @returns The words produced by `extractWordsFromThesaurus` for the file's
 *   content.
 */
export const extractWordsFromFile = async (
  filePath: PathLike,
): Promise<string[]> => {
  // The file is decoded as ASCII before being parsed.
  const content = await fs.readFile(filePath, { encoding: 'ascii' });
  return extractWordsFromThesaurus(content);
};
Loading…
Reference in new issue