import re
import argparse
from collections import OrderedDict

# Header line of an entry, e.g. ``@article{Smith2010,``:
# group 1 is the entry type, group 2 the citation key.
_ENTRY_START_RE = re.compile(r'@(.*)\{(.*),.*')
# A line consisting solely of the brace that closes an entry.
_ENTRY_END_RE = re.compile(r'^\}$')
# A single-line ``name = {value}`` field: group 1 is the name, group 2 the
# value without its outermost braces.
_FIELD_RE = re.compile(r'(.*)\s+=\s+\{(.*)\}.*')


class Entry:
    """A single bib entry: its type, its citation key and its fields."""

    def __init__(self):
        self.type = None  # text between '@' and '{' on the header line
        self.key = None  # citation key taken from the header line
        self.fields = OrderedDict()  # field name -> value, in file order

    def write(self, f, remove_keys, blind_keys):
        """
        Serialise this entry and write it to ``f`` in a single call.

        :param f: writable text file-like object
        :param remove_keys: field names that are left out of the output
        :param blind_keys: field names that are written prefixed with ``%%``
            (intended to make bibtex ignore them)
        :return: None
        """
        parts = ['@%s{%s,\n' % (self.type, self.key)]
        for key, val in self.fields.items():
            if key in remove_keys:
                continue
            if key in blind_keys:
                parts.append(' %%%%%s = {%s},\n' % (key, val))
            else:
                parts.append(' %s = {%s},\n' % (key, val))
        # Drop the trailing ',\n' before closing the entry, so that the last
        # field is not followed by a dangling comma.
        f.write(''.join(parts)[:-2] + '\n}\n')


# Every entry seen so far, keyed by citation key. It lives at module level,
# so repeated calls to clean() accumulate into the same mapping.
entries = dict()


def clean(infile, outfile):
    """
    Clean a bib file generated by Mendeley by removing unnecessary fields
    that might mess it up.

    Only single-line ``name = {value}`` fields are recognised; other lines
    inside an entry are ignored. When a citation key occurs more than once,
    the first occurrence is kept. An entry that is still open when the file
    ends is kept with the fields read so far.

    The input is read completely and closed before the output is opened, so
    ``infile`` and ``outfile`` may be the same path.

    :param infile: raw mendeley export
    :param outfile: cleaned bib file
    :return: None
    """
    # Fields to be removed
    remove_keys = ['annote', 'annotate', 'abstract', 'mendeley-groups',
                   'keywords', 'file']
    # Fields to be commented out - so that bibtex will ignore them
    blind_keys = ['issn', 'isbn', 'doi']

    current = None  # entry being parsed, or None while between entries
    with open(infile, 'r') as bibf:
        for line in bibf:
            if current is None:
                m = _ENTRY_START_RE.search(line)
                if m:
                    current = Entry()
                    current.type = m.group(1)
                    current.key = m.group(2)
                continue
            if _ENTRY_END_RE.search(line):
                # First occurrence of a key wins; later duplicates are dropped.
                entries.setdefault(current.key, current)
                current = None
                continue
            m = _FIELD_RE.search(line)
            if m:
                # Strip indentation so the name can match the key lists above.
                current.fields[m.group(1).strip()] = m.group(2)

    # The file ended without a closing brace: keep what was parsed instead of
    # waiting forever for a line that will never come.
    if current is not None:
        entries.setdefault(current.key, current)

    with open(outfile, 'w') as cleanf:
        for entry in entries.values():
            entry.write(cleanf, remove_keys, blind_keys)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--file', '-f', type=str, required=True, help='Bib file')
    parser.add_argument('--out', '-o', type=str, required=True, help='Output file')
    args = parser.parse_args()
    bibfile = args.file
    outfile = args.out
    clean(bibfile, outfile)