import xml.etree.ElementTree as ET import zipfile from pathlib import Path import gdown
def get_data(
    url: str,
    zip_path: str,
    raw_train_path: str,
    raw_test_path: str,
    processed_train_path: str,
    processed_test_path: str,
) -> None:
    """Download a zipped dataset, unpack it and write the processed texts.

    Steps, in order: download ``url`` to ``zip_path`` with ``gdown``, extract
    the archive into the current working directory, collect the text of every
    ``*.xml`` file under the raw train/test directories, and write each
    combined string to its processed output file.

    Args:
        url: Location passed to ``gdown.download``.
        zip_path: Where the downloaded archive is stored. This argument is
            honoured as given; it is not replaced by a fixed file name.
        raw_train_path: Directory holding the raw training ``*.xml`` files.
        raw_test_path: Directory holding the raw test ``*.xml`` files.
        processed_train_path: Output file for the combined training text.
        processed_test_path: Output file for the combined test text.

    Raises:
        IndexError: If an XML file's root element has no children.
        xml.etree.ElementTree.ParseError: If an XML file is malformed.
    """
    # Download data from Google Drive
    gdown.download(url, zip_path, quiet=False)

    # Unzip data into the current working directory
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(".")

    # Extract both splits before writing anything, so a failure while reading
    # the test data does not leave a lone train file behind.
    t_train_docs = _extract_texts_from_folder(raw_train_path)
    t_test_docs = _extract_texts_from_folder(raw_test_path)

    # Explicit UTF-8: the platform default encoding may be unable to represent
    # every character found in the source documents.
    Path(processed_train_path).write_text(t_train_docs, encoding="utf-8")
    Path(processed_test_path).write_text(t_test_docs, encoding="utf-8")


def _extract_texts_from_folder(folder_path: str) -> str:
    """Return the space-joined text of every ``*.xml`` file in ``folder_path``."""
    docs = []
    # glob() order depends on the filesystem; sort so the output is reproducible.
    for file_path in sorted(Path(folder_path).glob("*.xml")):
        # The texts live on the children of the root's first child element.
        container = ET.parse(file_path).getroot()[0]
        # Elements without text have ``.text is None``; skip them so the join
        # does not raise TypeError.
        docs.append(" ".join(r.text for r in container if r.text is not None))
    return " ".join(docs)
# Extract texts from the files in the train directory
t_train = []
for train_file in Path(raw_train_path).glob("*.xml"):
    # Texts are read from the children of the root's first child element.
    train_nodes = ET.parse(train_file).getroot()[0]
    t_train.append(" ".join([node.text for node in train_nodes]))
t_train_docs = " ".join(t_train)

# Extract texts from the files in the test directory
t_test = []
for test_file in Path(raw_test_path).glob("*.xml"):
    test_nodes = ET.parse(test_file).getroot()[0]
    t_test.append(" ".join([node.text for node in test_nodes]))
t_test_docs = " ".join(t_test)
def extract_texts_from_multiple_files(folder_path: str) -> str:
    """Return the text of every ``*.xml`` file in ``folder_path`` as one string.

    Args:
        folder_path: Directory to scan (non-recursively) for ``*.xml`` files.

    Returns:
        The per-file texts joined by single spaces; ``""`` when the directory
        contains no ``*.xml`` files.

    Raises:
        IndexError: If an XML file's root element has no children.
        xml.etree.ElementTree.ParseError: If an XML file is malformed.
    """
    all_docs = []
    # glob() order depends on the filesystem; sort so the output is reproducible.
    for file_path in sorted(Path(folder_path).glob("*.xml")):
        # The texts live on the children of the root's first child element.
        # Elements without text have ``.text is None`` and are skipped so the
        # join below cannot raise TypeError.
        list_of_text_in_one_file = [
            r.text
            for r in ET.parse(file_path).getroot()[0]
            if r.text is not None
        ]
        text_in_one_file = " ".join(list_of_text_in_one_file)
        all_docs.append(text_in_one_file)

    return " ".join(all_docs)
該函數本身處於較高層次,但 for 循環內的代碼涉及與 XML 解析、文本提取和字符串操作有關的較低層次的操作。
def extract_texts_from_multiple_files(folder_path: str) -> str:
    """Return the text of every ``*.xml`` file in ``folder_path`` as one string.

    Per-file extraction is delegated to ``extract_texts_from_each_file`` so
    this function only deals with iterating the directory and combining the
    results.

    Args:
        folder_path: Directory to scan (non-recursively) for ``*.xml`` files.

    Returns:
        The per-file texts joined by single spaces; ``""`` when the directory
        contains no ``*.xml`` files.
    """
    # glob() order depends on the filesystem; sort so the output is reproducible.
    xml_files = sorted(Path(folder_path).glob("*.xml"))
    return " ".join(
        extract_texts_from_each_file(file_path) for file_path in xml_files
    )
def extract_texts_from_each_file(file_path: str) -> str:
    """Return the space-joined texts found in a single XML file.

    The texts are read from the children of the root element's first child.

    Args:
        file_path: Path of the XML file; handed directly to ``ET.parse``.

    Returns:
        The element texts joined by single spaces. Elements that carry no
        text are skipped.

    Raises:
        IndexError: If the root element has no children.
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
    """
    container = ET.parse(file_path).getroot()[0]
    # ``.text`` is None for elements without text; joining None would raise
    # TypeError, so those elements are left out.
    list_of_text_in_one_file = [r.text for r in container if r.text is not None]
    return " ".join(list_of_text_in_one_file)