
[BUG] Errors in MLM training of Bert #49

Closed
1 of 2 tasks
marscrazy opened this issue Jun 7, 2022 · 0 comments · Fixed by #50
Labels: bug (Something isn't working)

@marscrazy (Contributor)

Describe the bug
There is an error when I try to fine-tune a BERT model on a masked language modeling (MLM) task.
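
Concretely, the MLM branch is set up with FlagAI's "masklm" loader plus the Hugging Face MLM collator; the relevant lines from the script under "To Reproduce" are:

auto_loader = AutoLoader("masklm", model_name="BERT-base-en", model_dir="./checkpoints")
model = auto_loader.get_model()
tokenizer = AutoTokenizer.from_pretrained("./checkpoints/BERT-base-en")
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)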

Tasks

  • An officially supported task in the examples folder (such as GLUE/Title-generation, ...)
  • My own task or dataset

To Reproduce
https://github.com/marscrazy/Tab2NL/blob/train_with_flagai/train_our_flagai.py

import os
import argparse
from data import get_dataset
from sklearn.metrics import roc_auc_score
import numpy as np
import random
import time
import torch
from flagai.trainer import Trainer
from flagai.auto_model.auto_loader import AutoLoader
from transformers import DataCollatorForLanguageModeling, AutoTokenizer
# the Hugging Face pieces below are needed by train_ptm_cls(); the HF Trainer is
# aliased so it does not clash with the FlagAI Trainer imported above
from transformers import AutoModelForSequenceClassification, TrainingArguments
from transformers import Trainer as HFTrainer

def set_seed(SEED):
    torch.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
    np.random.seed(SEED)
    random.seed(SEED)
    #torch.backends.cudnn.deterministic = True
set_seed(26)

def compute_metrics(predictions, labels, meta=None):
    predictions = predictions[:, 1]  # score of the positive class
    return {'roc_auc': roc_auc_score(labels, predictions)}

class txtDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

def finetuning_model(
        train_x, train_y, val_x, val_y, cv_fold=1, dataset_id=11,
        model_dir = "bert-base-ch", #bert-base-uncased
        is_mlm = False,
        num_train_epochs=10, #10
        per_device_train_batch_size=16,  # batch size per device during training
        per_device_eval_batch_size=32,  # batch size for evaluation
        warmup_steps=200,  # number of warmup steps for learning rate scheduler
        weight_decay=0.1,  # strength of weight decay
        logging_steps=100,#20
        seed=11,
        learning_rate=4e-5,
        metric_for_best_model=None,
        config = None,
        tokenizer = None,
        model = None,
        output_dir = None,
        logging_dir = None,
        return_model = False
):
    if output_dir is None:
        output_dir = './results/'+str(dataset_id)+'-cv-'+str(cv_fold)+'-mlm'
    if logging_dir is None:
        logging_dir = './logs/'+str(dataset_id)+'-cv-'+str(cv_fold)+'-mlm'
    #if config is None:
        # config = AutoConfig.from_pretrained(model_dir)
    #    import json
    #    config = json.load(open('./checkpoints/BERT-base-en/config.json'))

    if model is None:
        if is_mlm:
            auto_loader = AutoLoader(
                "masklm",
                model_name="BERT-base-en",
                model_dir='./checkpoints',
            )
        else:
            auto_loader = AutoLoader(
                "classification",
                model_name="BERT-base-en",
                model_dir='./checkpoints',
                class_num=2
            )
        model = auto_loader.get_model()
        tokenizer = AutoTokenizer.from_pretrained("./checkpoints/BERT-base-en")
    train_encodings = tokenizer(train_x.tolist(), truncation=True, padding=True)
    val_encodings = tokenizer(val_x.tolist(), truncation=True, padding=True)
    train_dataset = txtDataset(train_encodings, train_y.astype(np.longlong))
    val_dataset = txtDataset(val_encodings, val_y.astype(np.longlong))
    if is_mlm:
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm_probability=0.15
        )
    class MyTrainer(Trainer):
        # map the HF MLM collator's batch keys onto the FlagAI BERT forward signature
        def forward_step(self, data, model, mems):
            model_output = model(**{'input_ids': data['input_ids'],
                                    'segment_ids': data['token_type_ids'],
                                    'attention_mask': data['attention_mask']})
            print(model_output)  # only printing for now to inspect what comes back
    trainer = MyTrainer(
        env_type='pytorch',
        epochs=num_train_epochs,
        weight_decay=weight_decay,
        log_interval=logging_steps,
        seed=seed,
        lr=learning_rate,
        save_dir=output_dir,
        tensorboard_dir=logging_dir
    )
    trainer.train(model=model,  # the FlagAI model loaded above
        train_dataset=train_dataset,  # training dataset
        valid_dataset=val_dataset,  # evaluation dataset
        metric_methods=[compute_metrics] if not is_mlm else [],
        collate_fn=data_collator if is_mlm else None)

    dir_name = os.listdir(output_dir)[0]
    cur_model_dir = os.path.join(output_dir, dir_name)
    del model
    torch.cuda.empty_cache()
    time.sleep(5)
    if return_model:
        # the __main__ block below also expects the trainer back for evaluation
        return cur_model_dir, tokenizer, config, trainer
    return cur_model_dir, tokenizer, config
   
def train_ptm_cls(train_x,train_y,val_x, val_y, test_x, test_y, cv_fold=1, dataset_id=11,tokenizer=None, config=None,
                  model_dir = "../contrastive/resources/bert-base-uncased"):

    train_encodings = tokenizer(train_x.tolist(), truncation=True, padding=True)
    val_encodings = tokenizer(val_x.tolist(), truncation=True, padding=True)
    test_encodings = tokenizer(test_x.tolist(), truncation=True, padding=True)

    train_dataset = txtDataset(train_encodings, train_y.astype(np.longlong))
    test_dataset = txtDataset(test_encodings, test_y.astype(np.longlong))
    val_dataset = txtDataset(val_encodings, val_y.astype(np.longlong))
    model = AutoModelForSequenceClassification.from_pretrained(model_dir, config=config, from_tf=False, num_labels=2)
    output_dir = './results/'+str(dataset_id)+'-cv-'+str(cv_fold)+'-cls'
    log_dir = './logs/'+str(dataset_id)+'-cv-'+str(cv_fold)+'-cls'
    training_args = TrainingArguments(
        output_dir=output_dir,  # output directory
        num_train_epochs=10,  # total number of training epochs 
        per_device_train_batch_size=32,  # batch size per device during training
        per_device_eval_batch_size=32,  # batch size for evaluation
        warmup_steps=1000,  # number of warmup steps for learning rate scheduler
        weight_decay=0.1,  # strength of weight decay
        logging_dir=log_dir,  # directory for storing logs
        logging_steps=10, 
        eval_steps=10,
        save_steps=10,
        save_total_limit=1,
        do_eval=True,
        evaluation_strategy='steps',
        learning_rate=2e-5,
        seed=11,
        #save_strategy='steps',
        load_best_model_at_end=True,
        metric_for_best_model="roc_auc"
    )
    trainer = HFTrainer(  # Hugging Face Trainer (aliased in the imports); not the FlagAI Trainer
        model=model,  # the instantiated 🤗 Transformers model to be trained
        args=training_args,  # training arguments, defined above
        train_dataset=train_dataset,  # training dataset
        eval_dataset=test_dataset,  # evaluation dataset
        compute_metrics=compute_metrics
        #optimizers=(optimizer,None)
    )
    trainer.train()
    train_rs = trainer.evaluate(train_dataset)
    test_rs = trainer.evaluate(test_dataset)
    val_rs = trainer.evaluate(val_dataset)
    return train_rs['eval_roc_auc'], val_rs['eval_roc_auc'],test_rs['eval_roc_auc']


def train(dataset_id=1):  # note: this helper is not called from the __main__ block below
    ds = get_dataset(dataset_id=dataset_id)
    rs = []
    for i, (train_x, val_x, test_x, train_y, val_y, test_y) in enumerate(ds.generate_datasets(to_txt=True)):
        model_dir,tokenizer, config = finetuning_model(train_x,train_y,val_x, val_y,cv_fold=i, dataset_id=dataset_id,
                  model_dir = "../contrastive/resources/bert-base-uncased",is_mlm=True)
        train_auc, val_auc, test_auc = finetuning_model(
            train_x,train_y,val_x, val_y, test_x, test_y, cv_fold=i,dataset_id= dataset_id,tokenizer=tokenizer, config= config,
                  model_dir = model_dir,is_mlm=False)
        rs.append((train_auc,val_auc,test_auc))
        print("Train auc {:.3f}, val auc {:.3f}, Test auc {:.3f}".format(train_auc, val_auc, test_auc))

    for x,y,z in rs:
        print("Train auc {:.3f}, Val auc {:.3f}, Test auc {:.3f}".format(x,y,z))
    print("avg auc is {:.3f}\t{:.3f}".format(np.mean([x[-1] for x in rs]), np.std([x[-1] for x in rs])))
    #train_xgb(ds)

if __name__=="__main__":
    parser = argparse.ArgumentParser(description='Train Classifier with mixup', formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Data
    parser.add_argument('--model_dir', type=str, default='H:\\contrast\\SimCSE-main\\SimCSE-main\\bert-base-uncased',help='the path to pretrained models')
    parser.add_argument('--dataset_id', type=str, default='11',choices=['1','2','3','4','5','6','7','8','9','10','11'], help='Choose between 1-11.')
    # MLM pretrain
    parser.add_argument('--mlm_warmup_steps', default=1000, type=int, metavar='N', help='warmup steps (default: 1000)')
    parser.add_argument('--mlm_learning_rate', type=float, default=2e-5)
    parser.add_argument('--mlm_decay', type=float, default=0.1, help='weight decay (L2 penalty)')
    parser.add_argument('--mlm_epochs', type=int, default=300, help='number of epochs to train')
    parser.add_argument('--mlm_train_batch_size', type=int, default=32)
    parser.add_argument('--mlm_eval_batch_size', type=int, default=32)
    parser.add_argument('--mlm_logging_steps', default=10, type=int, metavar='N', help='logging frequency (default: 10)')
    # text classification
    parser.add_argument('--cls_epochs', type=int, default=300, help='number of epochs to train')
    parser.add_argument('--cls_train_batch_size', type=int, default=32)
    parser.add_argument('--cls_eval_batch_size', type=int, default=32)
    parser.add_argument('--cls_warmup_steps', default=1000, type=int, metavar='N', help='warmup steps (default: 1000)')
    parser.add_argument('--cls_decay', type=float, default=0.1, help='weight decay (L2 penalty)')
    parser.add_argument('--cls_logging_steps', default=10, type=int, metavar='N', help='logging frequency (default: 10)')
    parser.add_argument('--cls_learning_rate', type=float, default=2e-5)
    # Optimization options
    #parser.add_argument('--train', type=str, default='vanilla', choices=['vanilla', 'mixup', 'mixup_hidden', 'SRRS'], help='mixup layer')
    # training
    #parser.add_argument('--momentum', type=float, default=0.9)
    #parser.add_argument('--schedule', type=int, nargs='+', default=[150, 225], help='decrease learning rate at these epochs')
    #parser.add_argument('--gammas', type=float, nargs='+', default=[0.1, 0.1], help='LR is multiplied by gamma on schedule, number of gammas should be equal to schedule')

    # Checkpoints
    parser.add_argument('--resume', default='', type=str, metavar='PATH', help='path to latest checkpoint (default: none)')
    parser.add_argument('--start_epoch', default=0, type=int, metavar='N', help='manual epoch number (useful on restarts)')
    # random seed
    parser.add_argument('--seed', default=0, type=int, help='manual seed')
    parser.add_argument('--add_name', type=str, default='')
    parser.add_argument('--job_id', type=str, default='')
    args = parser.parse_args()
    ds = get_dataset(dataset_id=int(args.dataset_id))
    rs = []
    for i, (train_x, val_x, test_x, train_y, val_y, test_y) in enumerate(ds.generate_datasets(to_txt=True,with_title=True if args.dataset_id not in ['1','3'] else False)):
        model_dir,tokenizer, config = finetuning_model(train_x, train_y, val_x, val_y,cv_fold=i, dataset_id=args.dataset_id,
        model_dir = "hkunlp/T5_large_prefix_all_tasks_2upsample2",#bert-base-uncased,hkunlp/from_all_T5_large_prefix_sql2text2
        is_mlm = True,
        num_train_epochs=10,  #args.mlm_epochs,10
        per_device_train_batch_size=args.mlm_train_batch_size,  # batch size per device during training
        per_device_eval_batch_size=args.mlm_eval_batch_size,  # batch size for evaluation
        warmup_steps=args.mlm_warmup_steps,  # number of warmup steps for learning rate scheduler
        weight_decay=args.mlm_decay,  # strength of weight decay
        logging_steps=100,#20
        seed=11,
        learning_rate=4e-5,
        metric_for_best_model=None,
        config = None,
        tokenizer = None,
        model = None,
        output_dir = None,
        logging_dir = None,
        return_model = False)
        
        model_dir,tokenizer,config, trainer= finetuning_model(
            train_x, train_y, val_x, val_y, cv_fold=i,dataset_id= args.dataset_id,tokenizer=tokenizer, config= config,
                  model_dir = model_dir,is_mlm=False, return_model=True)
        test_encodings = tokenizer(test_x.tolist(), truncation=True, padding=True)
        test_dataset = txtDataset(test_encodings, test_y.astype(np.longlong))
        test_auc = trainer.evaluate(test_dataset)['eval_roc_auc']
        rs.append(test_auc)
        print("Test auc {:.3f}".format(test_auc))
    print("avg auc is {:.3f}\t{:.3f}".format(np.mean(rs),np.std(rs)))

Expected behavior
Fine-tuning BERT on the MLM task (and then on the classification task) should run without errors.
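
For the custom forward_step in the script, my working assumption (please correct me if the contract differs) is that the model output, including its loss, should be returned to the Trainer rather than only printed; roughly what I expected to be able to write:

class MyTrainer(Trainer):
    def forward_step(self, data, model, mems):
        # assumption: the Trainer reads the loss from the returned output and
        # runs the backward pass itself; I am also unsure whether the MLM
        # 'labels' produced by the collator need to be passed to the model here
        model_output = model(**{'input_ids': data['input_ids'],
                                'segment_ids': data['token_type_ids'],
                                'attention_mask': data['attention_mask']})
        return model_output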

Screenshots
[attached screenshot]

@marscrazy added the bug label on Jun 7, 2022
@marscrazy linked a pull request on Jun 7, 2022 that will close this issue