Document Classification

This library contains a single utility class and several functions to make it easy to run a simple document classifier for scientific papers.

An example walkthrough, using the ‘Disease Research State Model’ (DRSM) corpus (see https://github.com/chanzuckerberg/DRSM-corpus), is as follows:

# DRSM BASIC TRAINING ANALYSIS
# Example script: k-fold cross-validation of a PubMedBERT-based document
# classifier over a labelled dataframe of papers.
import datasets
from czLandscapingTk.docClassify import (
    HF_trainer_wrapper,
    run_HF_trainer_expt,
    run_HF_trainer_kfold_crossvalidation,
    get_folds_from_dataframe,
)
import pandas as pd

# NOTE: these three assignments are overwritten further down, once the data
# has been loaded; only the later values are used by the run.
column_names = ['ID_PAPER', 'Labeling_State', 'Comments', 'Explanation', 'Correct_Label', 'Agreement', 'TRIMMED_TEXT']
text_columns = ['TRIMMED_TEXT']
label_column = 'Correct_Label'
# Listed for reference; the categories passed to the run are derived from the
# loaded data instead (see `categories` below).
drsm_categories = ['characteristics or disease pathology',
               'therapeutics in the clinic',
               'disease mechanism',
               'patient-based therapeutics',
               'other',
               'irrelevant']

# Load data into a dataframe
# see https://github.com/chanzuckerberg/DRSM-corpus/blob/main/v1/drsm_corpus_v1.tsv
ml_df = pd.read_csv('/path/to/train/test/data.csv')
column_names = ['Origin', 'Labeling_State', 'Correct_Label', 'Agreement', 'Title', 'Abstract']
text_columns = ['Title', 'Abstract']
label_column = 'Correct_Label'
# Category list is taken from the labels actually present in the data.
categories = sorted(ml_df[label_column].unique())
folds = get_folds_from_dataframe(ml_df, 'Origin', 'Correct_Label', 8)

# Wrap the dataframe in a DatasetDict so that it can be split below
# (`ds_temp` must exist before `ds_temp['train']` is accessed).
ds_temp = datasets.DatasetDict({'train': datasets.Dataset.from_pandas(ml_df)})

# set up training / validation / test split
# 90% train, then the held-out 10% is halved into test and validation.
train_test_valid = ds_temp['train'].train_test_split(0.1)
test_valid = train_test_valid['test'].train_test_split(0.5)

# load the data into a DatasetDict
# NOTE(review): `drsm_ds` is not consumed by the cross-validation call below,
# which works from `folds`; presumably it is the `ds` input for a single run
# via `run_HF_trainer_expt` — confirm.
drsm_ds = datasets.DatasetDict({
     'train': train_test_valid['train'],
     'test': test_valid['test'],
     'valid': test_valid['train']})


run_name = 'drsm_experiment_pubmed_bert'
model_input = 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract'
model_path = '/path/to/model/single_pubmedbert_model'
log_path = '/path/to/logs/single_pubmedbert_model'
epochs = 6

df_results = run_HF_trainer_kfold_crossvalidation(folds, text_columns, label_column, categories,
                                                     run_name, model_input, model_path, log_path, epochs,
                                                     batch_size=8, problem_type='single_label_classification')

# Persist the results next to the model as a tab-separated file.
df_results.to_csv(model_path+'/run_data.tsv', sep='\t')
df_results # to see the results

source

HF_trainer_wrapper

 HF_trainer_wrapper (run_name, model_ckpt, output_dir, logging_dir,
                     epochs, max_length=512,
                     problem_type='multi_label_classification')

Class that supports training and experimenting with simple document classification tools under either a multi-label or multi-class classification paradigm.

source

run_HF_trainer_expt

 run_HF_trainer_expt (ds, text_columns, label_column, categories,
                      run_name, model_input, model_path, log_path, epochs,
                      batch_size=8, transfer_model=None,
                      problem_type='multi_label_classification',
                      run_training=True, freeze_layers=False)

Runs a single experiment with the supplied dataset, model and training parameters.

source

get_folds_from_dataframe

 get_folds_from_dataframe (df, id_col, category_col, n_splits)

source

run_HF_trainer_kfold_crossvalidation

 run_HF_trainer_kfold_crossvalidation (folds, text_columns, label_column,
                                       categories, run_name, model_input,
                                       model_path, log_path, epochs,
                                       batch_size=8,
                                       problem_type='multi_label_classification',
                                       transfer_model=None,
                                       run_training=True,
                                       freeze_layers=False)