Describe the bug
There is an error when I try to fine-tune a BERT model on a masked language modeling (MLM) task.
To Reproduce
https://github.com/marscrazy/Tab2NL/blob/train_with_flagai/train_our_flagai.py
Full script (train_our_flagai.py):

```python
import os
import argparse
from data import get_dataset
from sklearn.metrics import roc_auc_score
import numpy as np
import random
import time
import torch
from flagai.trainer import Trainer
from flagai.auto_model.auto_loader import AutoLoader
from transformers import DataCollatorForLanguageModeling, AutoTokenizer
# Missing from the original script but referenced by train_ptm_cls below:
from transformers import AutoModelForSequenceClassification, TrainingArguments


def set_seed(SEED):
    torch.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
    np.random.seed(SEED)
    random.seed(SEED)
    # torch.backends.cudnn.deterministic = True


set_seed(26)


def compute_metrics(predictions, labels, meta=None):
    predictions = predictions[:, 1]
    return {'roc_auc': roc_auc_score(labels, predictions)}


class txtDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


def finetuning_model(train_x, train_y, val_x, val_y,
                     cv_fold=1,
                     dataset_id=11,
                     model_dir="bert-base-ch",  # bert-base-uncased
                     is_mlm=False,
                     num_train_epochs=10,  # 10
                     per_device_train_batch_size=16,  # batch size per device during training
                     per_device_eval_batch_size=32,   # batch size for evaluation
                     warmup_steps=200,     # number of warmup steps for learning rate scheduler
                     weight_decay=0.1,     # strength of weight decay
                     logging_steps=100,    # 20
                     seed=11,
                     learning_rate=4e-5,
                     metric_for_best_model=None,
                     config=None,
                     tokenizer=None,
                     model=None,
                     output_dir=None,
                     logging_dir=None,
                     return_model=False):
    if output_dir is None:
        output_dir = './results/' + str(dataset_id) + '-cv-' + str(cv_fold) + '-mlm'
    if logging_dir is None:
        logging_dir = './logs/' + str(dataset_id) + '-cv-' + str(cv_fold) + '-mlm'
    # if config is None:
    #     config = AutoConfig.from_pretrained(model_dir)
    #     import json
    #     config = json.load(open('./checkpoints/BERT-base-en/config.json'))
    if model is None:
        if is_mlm:
            auto_loader = AutoLoader(
                "masklm",
                model_name="BERT-base-en",
                model_dir='./checkpoints',
            )
        else:
            auto_loader = AutoLoader(
                "classification",
                model_name="BERT-base-en",
                model_dir='./checkpoints',
                class_num=2
            )
        model = auto_loader.get_model()
    tokenizer = AutoTokenizer.from_pretrained("./checkpoints/BERT-base-en")

    train_encodings = tokenizer(train_x.tolist(), truncation=True, padding=True)
    val_encodings = tokenizer(val_x.tolist(), truncation=True, padding=True)
    train_dataset = txtDataset(train_encodings, train_y.astype(np.longlong))
    val_dataset = txtDataset(val_encodings, val_y.astype(np.longlong))

    if is_mlm:
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer, mlm_probability=0.15
        )

    class MyTrainer(Trainer):
        def forward_step(self, data, model, mems):
            model_output = model(**{'input_ids': data['input_ids'],
                                    'segment_ids': data['token_type_ids'],
                                    'attention_mask': data['attention_mask']})
            print(model_output)

    trainer = MyTrainer(
        env_type='pytorch',
        epochs=num_train_epochs,
        weight_decay=weight_decay,
        log_interval=logging_steps,
        seed=seed,
        lr=learning_rate,
        save_dir=output_dir,
        tensorboard_dir=logging_dir
    )
    trainer.train(model=model,                  # the instantiated 🤗 Transformers model to be trained
                  train_dataset=train_dataset,  # training dataset
                  valid_dataset=val_dataset,    # evaluation dataset
                  metric_methods=[compute_metrics] if not is_mlm else [],
                  collate_fn=data_collator if is_mlm else None)

    dir_name = os.listdir(output_dir)[0]
    cur_model_dir = os.path.join(output_dir, dir_name)
    del model
    torch.cuda.empty_cache()
    time.sleep(5)
    if return_model:
        return cur_model_dir, tokenizer, config


# NOTE: train_ptm_cls still uses the Hugging Face Trainer/TrainingArguments API
# (the Trainer imported above is FlagAI's) and is not called anywhere in this script.
def train_ptm_cls(train_x, train_y, val_x, val_y, test_x, test_y,
                  cv_fold=1, dataset_id=11, tokenizer=None, config=None,
                  model_dir="../contrastive/resources/bert-base-uncased"):
    train_encodings = tokenizer(train_x.tolist(), truncation=True, padding=True)
    val_encodings = tokenizer(val_x.tolist(), truncation=True, padding=True)
    test_encodings = tokenizer(test_x.tolist(), truncation=True, padding=True)
    train_dataset = txtDataset(train_encodings, train_y.astype(np.longlong))
    test_dataset = txtDataset(test_encodings, test_y.astype(np.longlong))
    val_dataset = txtDataset(val_encodings, val_y.astype(np.longlong))
    model = AutoModelForSequenceClassification.from_pretrained(model_dir, config=config,
                                                               from_tf=False, num_labels=2)
    output_dir = './results/' + str(dataset_id) + '-cv-' + str(cv_fold) + '-cls'
    log_dir = './logs/' + str(dataset_id) + '-cv-' + str(cv_fold) + '-cls'
    training_args = TrainingArguments(
        output_dir=output_dir,               # output directory
        num_train_epochs=10,                 # total number of training epochs
        per_device_train_batch_size=32,      # batch size per device during training
        per_device_eval_batch_size=32,       # batch size for evaluation
        warmup_steps=1000,                   # number of warmup steps for learning rate scheduler
        weight_decay=0.1,                    # strength of weight decay
        logging_dir=log_dir,                 # directory for storing logs
        logging_steps=10,
        eval_steps=10,
        save_steps=10,
        save_total_limit=1,
        do_eval=True,
        evaluation_strategy='steps',
        learning_rate=2e-5,
        seed=11,
        # save_strategy='steps',
        load_best_model_at_end=True,
        metric_for_best_model="roc_auc"
    )
    trainer = Trainer(
        model=model,                  # the instantiated 🤗 Transformers model to be trained
        args=training_args,           # training arguments, defined above
        train_dataset=train_dataset,  # training dataset
        eval_dataset=test_dataset,    # evaluation dataset
        compute_metrics=compute_metrics
        # optimizers=(optimizer, None)
    )
    trainer.train()
    train_rs = trainer.evaluate(train_dataset)
    test_rs = trainer.evaluate(test_dataset)
    val_rs = trainer.evaluate(val_dataset)
    return train_rs['eval_roc_auc'], val_rs['eval_roc_auc'], test_rs['eval_roc_auc']


def train(dataset_id=1):
    ds = get_dataset(dataset_id=dataset_id)
    rs = []
    for i, (train_x, val_x, test_x, train_y, val_y, test_y) in enumerate(ds.generate_datasets(to_txt=True)):
        model_dir, tokenizer, config = finetuning_model(
            train_x, train_y, val_x, val_y,
            cv_fold=i, dataset_id=dataset_id,
            model_dir="../contrastive/resources/bert-base-uncased",
            is_mlm=True)
        train_auc, val_auc, test_auc = finetuning_model(
            train_x, train_y, val_x, val_y, test_x, test_y,
            cv_fold=i, dataset_id=dataset_id,
            tokenizer=tokenizer, config=config,
            model_dir=model_dir, is_mlm=False)
        rs.append((train_auc, val_auc, test_auc))
        print("Train auc {:.3f}, val auc {:.3f}, Test auc {:.3f}".format(train_auc, val_auc, test_auc))
    for x, y, z in rs:
        print("Train auc {:.3f}, Val auc {:.3f}, Test auc {:.3f}".format(x, y, z))
    print("avg auc is {:.3f}\t{:.3f}".format(np.mean([x[-1] for x in rs]),
                                             np.std([x[-1] for x in rs])))
    # train_xgb(ds)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Train Classifier with mixup',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # Data
    parser.add_argument('--model_dir', type=str,
                        default='H:\\contrast\\SimCSE-main\\SimCSE-main\\bert-base-uncased',
                        help='the path to pretrained models')
    parser.add_argument('--dataset_id', type=str, default='11',
                        choices=['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'],
                        help='Choose between 1-11.')
    # MLM pretrain
    parser.add_argument('--mlm_warmup_steps', default=1000, type=int, metavar='N',
                        help='warmup steps (default: 1000)')
    parser.add_argument('--mlm_learning_rate', type=float, default=2e-5)
    parser.add_argument('--mlm_decay', type=float, default=0.1, help='weight decay (L2 penalty)')
    parser.add_argument('--mlm_epochs', type=int, default=300, help='number of epochs to train')
    parser.add_argument('--mlm_train_batch_size', type=int, default=32)
    parser.add_argument('--mlm_eval_batch_size', type=int, default=32)
    parser.add_argument('--mlm_logging_steps', default=10, type=int, metavar='N',
                        help='logging frequency (default: 10)')
    # text classification
    parser.add_argument('--cls_epochs', type=int, default=300, help='number of epochs to train')
    parser.add_argument('--cls_train_batch_size', type=int, default=32)
    parser.add_argument('--cls_eval_batch_size', type=int, default=32)
    parser.add_argument('--cls_warmup_steps', default=1000, type=int, metavar='N',
                        help='warmup steps (default: 1000)')
    parser.add_argument('--cls_decay', type=float, default=0.1, help='weight decay (L2 penalty)')
    parser.add_argument('--cls_logging_steps', default=10, type=int, metavar='N',
                        help='logging frequency (default: 10)')
    parser.add_argument('--cls_learning_rate', type=float, default=2e-5)
    # Optimization options
    # parser.add_argument('--train', type=str, default='vanilla', choices=['vanilla', 'mixup', 'mixup_hidden', 'SRRS'], help='mixup layer')
    # training
    # parser.add_argument('--momentum', type=float, default=0.9)
    # parser.add_argument('--schedule', type=int, nargs='+', default=[150, 225], help='decrease learning rate at these epochs')
    # parser.add_argument('--gammas', type=float, nargs='+', default=[0.1, 0.1], help='LR is multiplied by gamma on schedule, number of gammas should be equal to schedule')
    # Checkpoints
    parser.add_argument('--resume', default='', type=str, metavar='PATH',
                        help='path to latest checkpoint (default: none)')
    parser.add_argument('--start_epoch', default=0, type=int, metavar='N',
                        help='manual epoch number (useful on restarts)')
    # random seed
    parser.add_argument('--seed', default=0, type=int, help='manual seed')
    parser.add_argument('--add_name', type=str, default='')
    parser.add_argument('--job_id', type=str, default='')
    args = parser.parse_args()

    ds = get_dataset(dataset_id=int(args.dataset_id))
    rs = []
    for i, (train_x, val_x, test_x, train_y, val_y, test_y) in enumerate(
            ds.generate_datasets(to_txt=True,
                                 with_title=True if args.dataset_id not in ['1', '3'] else False)):
        model_dir, tokenizer, config = finetuning_model(
            train_x, train_y, val_x, val_y,
            cv_fold=i, dataset_id=args.dataset_id,
            model_dir="hkunlp/T5_large_prefix_all_tasks_2upsample2",  # bert-base-uncased, hkunlp/from_all_T5_large_prefix_sql2text2
            is_mlm=True,
            num_train_epochs=10,  # args.mlm_epochs, 10
            per_device_train_batch_size=args.mlm_train_batch_size,  # batch size per device during training
            per_device_eval_batch_size=args.mlm_eval_batch_size,    # batch size for evaluation
            warmup_steps=args.mlm_warmup_steps,  # number of warmup steps for learning rate scheduler
            weight_decay=args.mlm_decay,         # strength of weight decay
            logging_steps=100,  # 20
            seed=11,
            learning_rate=4e-5,
            metric_for_best_model=None,
            config=None,
            tokenizer=None,
            model=None,
            output_dir=None,
            logging_dir=None,
            return_model=False)
        model_dir, tokenizer, config, trainer = finetuning_model(
            train_x, train_y, val_x, val_y,
            cv_fold=i, dataset_id=args.dataset_id,
            tokenizer=tokenizer, config=config,
            model_dir=model_dir, is_mlm=False, return_model=True)
        test_encodings = tokenizer(test_x.tolist(), truncation=True, padding=True)
        test_dataset = txtDataset(test_encodings, test_y.astype(np.longlong))
        test_auc = trainer.evaluate(test_dataset)['eval_roc_auc']
        rs.append(test_auc)
        print("Test auc {:.3f}".format(test_auc))
    print("avg auc is {:.3f}\t{:.3f}".format(np.mean(rs), np.std(rs)))
```
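For reference, here is a stripped-down sketch of just the MLM path the script exercises, reusing the same FlagAI calls as above. The two toy sentences, the dummy labels, and the ToyDataset helper are placeholders I added for illustration; it assumes the BERT-base-en checkpoint is already available under ./checkpoints, exactly as in the full script.

```python
# Minimal sketch of the MLM fine-tuning path (placeholder data; assumes ./checkpoints/BERT-base-en exists).
import numpy as np
import torch
from flagai.trainer import Trainer
from flagai.auto_model.auto_loader import AutoLoader
from transformers import DataCollatorForLanguageModeling, AutoTokenizer


class ToyDataset(torch.utils.data.Dataset):
    """Same item layout as txtDataset in the full script."""
    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


auto_loader = AutoLoader("masklm", model_name="BERT-base-en", model_dir="./checkpoints")
model = auto_loader.get_model()
tokenizer = AutoTokenizer.from_pretrained("./checkpoints/BERT-base-en")

texts = ["a first toy sentence", "a second toy sentence"]  # placeholder inputs
labels = np.array([0, 1], dtype=np.longlong)               # kept only to mirror txtDataset; the MLM collator builds its own labels
dataset = ToyDataset(tokenizer(texts, truncation=True, padding=True), labels)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)


class MyTrainer(Trainer):
    # Same override as in the full script: it prints the model output and returns nothing.
    def forward_step(self, data, model, mems):
        model_output = model(**{'input_ids': data['input_ids'],
                                'segment_ids': data['token_type_ids'],
                                'attention_mask': data['attention_mask']})
        print(model_output)


trainer = MyTrainer(env_type="pytorch", epochs=1, weight_decay=0.1, log_interval=10,
                    seed=11, lr=4e-5, save_dir="./results/mlm-sketch",
                    tensorboard_dir="./logs/mlm-sketch")
trainer.train(model=model,
              train_dataset=dataset,
              valid_dataset=dataset,
              metric_methods=[],
              collate_fn=data_collator)  # MLM training, where the error described above occurs
```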
Expected behavior
Fine-tuning BERT on the MLM task, and then on the classification task, completes without errors.
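For comparison, the plain Hugging Face Trainer version of the same MLM fine-tuning step (the script above already borrows DataCollatorForLanguageModeling and AutoTokenizer from transformers) would look roughly like the sketch below. The model name, toy sentences, and output directory are placeholders and not part of the repro; it only illustrates the behavior I expect from the FlagAI path.

```python
# Sketch only: expected MLM fine-tuning flow with the Hugging Face Trainer
# (placeholder model name, sentences, and output directory).
import torch
from transformers import (AutoModelForMaskedLM, AutoTokenizer,
                          DataCollatorForLanguageModeling, Trainer, TrainingArguments)


class TextDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}

    def __len__(self):
        return len(self.encodings["input_ids"])


tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")

texts = ["a first toy sentence", "a second toy sentence"]  # placeholder inputs
dataset = TextDataset(tokenizer(texts, truncation=True, padding=True))

collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)
args = TrainingArguments(output_dir="./results/hf-mlm-sketch",
                         num_train_epochs=1,
                         per_device_train_batch_size=2,
                         logging_steps=10)
Trainer(model=model, args=args, train_dataset=dataset, data_collator=collator).train()
```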