import logging
import warnings

import torch
from datasets import Dataset, DatasetDict

# Build a Hugging Face Dataset from the pandas DataFrame `df`.
# NOTE(review): `df` is not defined in this snippet; it must already exist
# at this point (presumably loaded earlier in the notebook).
ds = Dataset.from_pandas(df)

# Silence Python warnings and every log record at WARNING level or below
# for the rest of the session.
warnings.simplefilter('ignore')
logging.disable(logging.WARNING)

# Checkpoint name shared by the model and its tokenizer.
model_nm = 'anferico/bert-for-patents'

# Load model directly
# NOTE(review): transformers is imported only after warnings/logging have
# been silenced, as in the original statement order.
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# num_labels=1 requests a single-output sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=1)
# Reuse model_nm so the tokenizer can never drift from the model checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_nm)
def tok_func(x):
    """Tokenize the ``'input'`` field of ``x`` with the module-level tokenizer.

    Args:
        x: A mapping that has an ``'input'`` key. NOTE(review): presumably a
            dataset row or batch of rows holding the text to encode; confirm
            against the caller.

    Returns:
        Whatever ``tokenizer(x['input'])`` returns, unchanged.
    """
    return tokenizer(x['input'])


# Tokenize all the sentences using the tokenizer
# Apply tok_func over the dataset; batched=True hands it a batch of rows per
# call instead of a single row.
tok_ds = ds.map(tok_func, batched=True)
# Rename the 'score' column to 'labels'.
# NOTE(review): presumably the column name the training loop expects; confirm.
tok_ds = tok_ds.rename_columns({'score': 'labels'})
import numpy as np


def corr(x, y):
    """Return the Pearson correlation coefficient between ``x`` and ``y``.

    Both inputs are converted to NumPy arrays and flattened to 1-D before
    the correlation is computed, so column vectors of shape ``(n, 1)`` and
    plain nested sequences are accepted for either argument.

    Args:
        x: Array-like of values; any shape that flattens to ``n`` elements.
        y: Array-like of values; any shape that flattens to ``n`` elements.

    Returns:
        The off-diagonal entry of the 2x2 matrix from ``np.corrcoef``.
    """
    # Change the 2-D arrays into 1-D arrays so np.corrcoef treats each
    # input as a single variable with n observations.
    x_flat = np.asarray(x).ravel()
    y_flat = np.asarray(y).ravel()
    return np.corrcoef(x_flat, y_flat)[0, 1]


def corr_d(eval_pred):
    """Wrap :func:`corr` in a metrics dictionary.

    Args:
        eval_pred: An iterable that unpacks into exactly the two arguments
            of :func:`corr`. NOTE(review): presumably the
            ``(predictions, labels)`` pair given to a ``compute_metrics``
            callback; confirm against the caller.

    Returns:
        A dict with the single key ``'pearson'`` mapping to the coefficient.
    """
    return {'pearson': corr(*eval_pred)}
# Top comments (0)