`Textual` can help you with the following tasks.
Obtaining data
We can extract data from the following sources:
- a URL (http or https)
- a path to a textual file (.txt, not .doc/.docx)
- an interactive interface in the notebook, asking to upload the file
Prepare model training
We can prepare model training in the following way:
- PyTorch datasets and a data collator for the dataloader
- A Hugging Face trainer
# Interactive entry point; presumably shows the notebook upload widget — confirm against Textual.from_upload.
uploaded = Textual.from_upload()
# Calling the returned object yields the value bound to `textual` (presumably a Textual built from the uploaded file — confirm).
textual = uploaded()
# Bare expression: in a notebook cell this displays the object's representation.
textual
Preview textual data by page
# Calling the instance presumably renders the page-by-page preview — confirm against Textual.__call__.
textual()
A TransformBlock designed for Hugging Face tokenizers
# Test cases (imports may be duplicated because these tests will live in a dedicated file)
from pathlib import Path
from typing import List
import pytest
from test_common.utils_4_tests import DATA_DIR
from test_utils import GITHUB_TEST_DATA_URL, check_connection_github
# Plain-text test file: local path and the same file name under the GitHub test-data URL.
LOCAL_TEST_TXT = DATA_DIR / "to_download.txt"
GITHUB_TEST_TXT = f"{GITHUB_TEST_DATA_URL}/to_download.txt"
# Test file that the tests read as UTF-8; the URL form percent-encodes the space in the file name.
LOCAL_TEST_TXT_UTF8 = DATA_DIR / "Deep learning.txt"
GITHUB_TEST_TXT_UTF8 = f"{GITHUB_TEST_DATA_URL}/Deep%20learning.txt"
@pytest.fixture(scope="session")
def test_txt_content():
    """Return the text of the local test file, read once per test session.

    The file is decoded explicitly as UTF-8 so the fixture's value does not
    depend on the platform's default (locale) encoding, matching how the
    other test file is read in this module.
    """
    return LOCAL_TEST_TXT.read_text(encoding="utf-8")
@pytest.fixture(scope="session")
def local_textual(test_txt_content):
    """Provide a session-scoped ``Textual`` built from the local test file's text."""
    textual_from_text = Textual(test_txt_content)
    return textual_from_text
class Test_Textual:
    """Tests for creating ``Textual`` objects from raw text, local paths and URLs."""

    @staticmethod
    def _flatten(text: str) -> str:
        """Return *text* with newlines replaced by spaces and carriage returns removed.

        This is the form the tests below expect ``Textual.text`` to have.
        """
        return text.replace("\n", " ").replace("\r", "")

    def test_init(self, local_textual, test_txt_content):
        """Test initialization of Textual from text"""
        assert local_textual.text == self._flatten(test_txt_content)

    def test_init_encoding(self):
        """Test initialization of Textual from text read as UTF-8"""
        content = LOCAL_TEST_TXT_UTF8.read_text(encoding="utf-8")
        textual = Textual(content)
        assert textual.text == self._flatten(content)

    def test_from_path(self, local_textual):
        """Test create Textual from path (existing)"""
        textual = Textual.from_path(LOCAL_TEST_TXT)
        assert textual.text == local_textual.text

    def test_from_path_error(self):
        """Test that creating Textual from a path that does not exist raises"""
        # NOTE(review): FileExistsError is kept as the expected type; confirm
        # that Textual.from_path really raises it rather than the more usual
        # FileNotFoundError for a missing file.
        with pytest.raises(FileExistsError):
            # The return value is irrelevant: the call itself must raise.
            Textual.from_path("does_not_exist.txt")

    @pytest.mark.github
    def test_from_url(self, check_connection_github, local_textual):
        """Test extract Textual from URL"""
        textual = Textual.from_url(GITHUB_TEST_TXT)
        assert textual.text == local_textual.text, f"URL text: {textual.text}"

    @pytest.mark.github
    def test_from_url_non_ascii(self, check_connection_github):
        """Test extract Textual from URL with non-ascii characters"""
        textual = Textual.from_url(GITHUB_TEST_TXT_UTF8)
        # Compare against the local copy of the same file, normalised the same way.
        content = self._flatten(LOCAL_TEST_TXT_UTF8.read_text(encoding="utf-8"))
        assert textual.text == content, f"URL text: {textual.text}"