1. 介绍
本工具用于批量处理VOC格式数据集的xml标注文件,功能包括:移动、复制、检查无效(无法解析)或无目标的文件、替换类别名、删除指定类别等。可通过继承BaseOp类并实现run方法,轻松扩展其他功能。













2. 例子
# 例子-复制: 使用5个进程, 将 /home/my_data 下所有的xml文件(包括子文件夹下的)复制到 /home/my_data_new/ 目录下
python VocXMLOps.py --root /home/my_data --recursion --num_processes 5 copy --to_root /home/my_data_new/
# 例子-替换类别名: 将 /home/my_data 当前目录下(不含子文件夹)的xml文件中的 'Person' 类别名替换为 '人'
python VocXMLOps.py --root /home/my_data replace --old_name Person --new_name 人
3. 代码
import xml.etree.ElementTree as ET
import os
import shutil
from multiprocessing import Pool
class BaseOp(object):
    """Base class for an operation applied to one XML file at a time.

    Subclasses override :meth:`run`. ``VOCParser.run`` submits ``op.run`` to a
    ``multiprocessing`` pool, so instances must be picklable.
    """

    def __init__(self):
        """Initialise the operation; the base class keeps no state."""
        pass

    def run(self, file_path):
        """Apply the operation to a single file.

        :param file_path: path of the XML file to process.
        :raises NotImplementedError: always, until a subclass overrides it.
        """
        # Name the concrete class so a forgotten override is easy to locate.
        raise NotImplementedError(
            '{}.run() must be implemented by the subclass'.format(type(self).__name__)
        )
class IsInvaild(BaseOp):
    """Find XML files that cannot be parsed or that contain no ``object`` element.

    Offending files are reported on stdout and, when ``remove`` is set, deleted.
    """

    def __init__(self, remove=False):
        """
        :param remove: when true, delete each offending file instead of only
            reporting it.
        """
        super(IsInvaild, self).__init__()
        self.remove = remove

    def run(self, file_path):
        """Check one file and report (or remove) it if it is invalid or empty.

        :param file_path: path of the XML file to check.
        """
        try:
            tree = ET.parse(file_path)
        except OSError as exc:
            # A read failure (missing file, permissions, ...) says nothing about
            # the content, so the file must never be deleted because of it.
            print('Can not read xml file : {} ({})'.format(file_path, exc))
            return
        except Exception:
            # Anything else raised while parsing means the content itself is
            # not usable XML.
            self._report(file_path, 'invaild')
            return

        if not tree.findall('object'):
            self._report(file_path, 'no object')

    def _report(self, file_path, kind):
        """Print the finding for ``file_path`` and delete the file if requested.

        :param file_path: path of the offending file.
        :param kind: label inserted into the message ('invaild' or 'no object').
        """
        if not self.remove:
            print('Found {} xml file : {}'.format(kind, file_path))
            return
        try:
            os.remove(file_path)
        except OSError:
            print('Found {} xml file : {}, but remove failed.'.format(kind, file_path))
        else:
            print('Found and remove {} xml file : {}'.format(kind, file_path))
class DelObject(BaseOp):
    """Delete every ``object`` element whose ``name`` is in a given list."""

    def __init__(self, del_object_names: list):
        """
        :param del_object_names: class names whose objects should be removed;
            ``None`` is treated as an empty list.
        """
        super(DelObject, self).__init__()
        self.del_object_names = del_object_names if del_object_names is not None else []

    def run(self, file_path):
        """Remove the matching objects from one file and rewrite it in place.

        The file is only rewritten, and the message only printed, when at
        least one object was actually removed.

        :param file_path: path of the XML file to edit.
        """
        try:
            tree = ET.parse(file_path)
        except (OSError, ET.ParseError) as exc:
            print("Del objs skipped {} : {}".format(file_path, exc))
            return

        root = tree.getroot()
        removed = 0
        for obj in root.findall('object'):
            # findtext() tolerates an <object> without a <name> child, which
            # would otherwise abort processing of the whole file.
            if obj.findtext('name') in self.del_object_names:
                root.remove(obj)
                removed += 1

        if not removed:
            return
        try:
            tree.write(file_path)
        except OSError as exc:
            print("Del objs failed to write {} : {}".format(file_path, exc))
            return
        print("Del objs: {} from {}".format(self.del_object_names, file_path))
class MoveOp(BaseOp):
    """Move an XML file into a target directory, keeping its base name."""

    def __init__(self, to_root):
        """
        :param to_root: destination directory; created on demand if missing.
        """
        super(MoveOp, self).__init__()
        self.to_root = to_root

    def run(self, file_path):
        """Move ``file_path`` to ``to_root`` and report the outcome on stdout.

        Files are placed directly in ``to_root`` under their base name, so two
        sources with the same base name map to the same destination path.

        :param file_path: path of the file to move.
        """
        to_path = os.path.join(self.to_root, os.path.basename(file_path))
        try:
            # exist_ok lets several worker processes create the directory
            # concurrently without failing.
            os.makedirs(self.to_root, exist_ok=True)
            shutil.move(file_path, to_path)
        except OSError as exc:  # shutil.Error derives from OSError as well
            print("Move {} to {} failed: {}".format(file_path, to_path, exc))
        else:
            print("Move {} to {}".format(file_path, to_path))
class CopyOp(BaseOp):
    """Copy an XML file into a target directory, keeping its base name."""

    def __init__(self, to_root):
        """
        :param to_root: destination directory; created on demand if missing.
        """
        super(CopyOp, self).__init__()
        self.to_root = to_root

    def run(self, file_path):
        """Copy ``file_path`` to ``to_root`` and report the outcome on stdout.

        Files are placed directly in ``to_root`` under their base name, so two
        sources with the same base name map to the same destination path.

        :param file_path: path of the file to copy.
        """
        to_path = os.path.join(self.to_root, os.path.basename(file_path))
        try:
            # exist_ok lets several worker processes create the directory
            # concurrently without failing.
            os.makedirs(self.to_root, exist_ok=True)
            shutil.copy(file_path, to_path)
        except OSError as exc:  # covers shutil.SameFileError / shutil.Error too
            print("Copy {} to {} failed: {}".format(file_path, to_path, exc))
        else:
            print("Copy {} to {}".format(file_path, to_path))
class ReplaceNameOp(BaseOp):
    """Rename a class: rewrite the ``name`` of every matching ``object``."""

    def __init__(self, old_name, new_name):
        """
        :param old_name: class name to look for.
        :param new_name: class name to write in its place.
        """
        super(ReplaceNameOp, self).__init__()
        self.old_name = old_name
        self.new_name = new_name

    def run(self, file_path):
        """Replace ``old_name`` with ``new_name`` in one file, in place.

        The file is only rewritten, and the message only printed, when at
        least one name was actually replaced.

        :param file_path: path of the XML file to edit.
        """
        try:
            tree = ET.parse(file_path)
        except (OSError, ET.ParseError) as exc:
            print("Replace skipped {} : {}".format(file_path, exc))
            return

        replaced = 0
        for obj in tree.findall('object'):
            name_node = obj.find('name')
            # An <object> without a <name> child is skipped instead of
            # aborting the whole file.
            if name_node is not None and name_node.text == self.old_name:
                name_node.text = self.new_name
                replaced += 1

        if not replaced:
            return
        try:
            tree.write(file_path)
        except OSError as exc:
            print("Replace failed to write {} : {}".format(file_path, exc))
            return
        print("Replace {}".format(file_path))
def _run_ops(ops, file_path):
    """Apply every op to ``file_path`` in order and return failure descriptions.

    Lives at module level so the pool can pickle it. A failing op does not
    stop the ops that follow it.
    """
    failures = []
    for op in ops:
        try:
            op.run(file_path)
        except Exception as exc:
            failures.append('{}: {!r}'.format(type(op).__name__, exc))
    return failures


class VOCParser(object):
    """Collect the XML files under a directory and run registered ops on them."""

    def __init__(self, root: str, recursion=False, num_processes=10):
        """
        :param root: directory to scan for ``.xml`` files.
        :param recursion: whether to descend into all sub-directories.
        :param num_processes: number of worker processes in the pool.
        """
        self.root = root
        self.ops = []
        self.xmls = []
        self.jpgs = []
        # Scan first: if the directory cannot be listed, no worker processes
        # have been started yet and nothing is left behind.
        self.find_xml(self.root, recursion)
        self.pool = Pool(processes=num_processes)

    def register(self, op: 'BaseOp'):
        """Add an operation; ops run on each file in registration order."""
        self.ops.append(op)

    def find_xml(self, root, recursion):
        """Append the path of every ``.xml`` file under ``root`` to ``self.xmls``.

        :param root: directory to list.
        :param recursion: whether to descend into sub-directories.
        """
        # Sorted so that the processing order is reproducible across runs.
        for entry in sorted(os.listdir(root)):
            path = os.path.join(root, entry)
            if os.path.isdir(path):
                if recursion:
                    self.find_xml(path, recursion)
            elif os.path.isfile(path) and path.endswith('.xml'):
                self.xmls.append(path)

    def run(self):
        """Process every collected file with all registered ops, then wait.

        One task is submitted per file, and that task applies the ops
        sequentially, so two ops never touch the same file concurrently.
        The pool is closed afterwards, so ``run`` can be called only once
        per instance.
        """
        pending = []
        if self.ops:
            pending = [
                (xml, self.pool.apply_async(_run_ops, (self.ops, xml)))
                for xml in self.xmls
            ]
        self.pool.close()
        self.pool.join()

        # Surface worker-side errors instead of dropping the async results.
        for xml, result in pending:
            try:
                failures = result.get()
            except Exception as exc:
                failures = [repr(exc)]
            for failure in failures:
                print('Failed on {} : {}'.format(xml, failure))
if __name__ == '__main__':
    # Command-line entry point: pick one operation via a sub-command and apply
    # it to every XML file found under --root.
    import argparse

    parse = argparse.ArgumentParser(description='VOC格式数据集,xml文件操作.')
    parse.add_argument('--root', required=True, help='文件根目录.')
    # Help text fixed: the flag enables recursion, it does not negate it.
    parse.add_argument('--recursion', action='store_true', help='递归处理所有子文件夹中的xml文件.')
    parse.add_argument('--num_processes', default=10, type=int, help='进程数.')

    subparser = parse.add_subparsers(dest='subparser_name', description='子命令')
    # Without a sub-command there is nothing to do, so fail with a usage
    # message instead of scanning the tree and silently exiting.
    subparser.required = True

    invaild = subparser.add_parser('invaild', help='查找无效或无目标的xml文件', description='查找无效或无目标的xml文件')
    invaild.add_argument('--remove', action='store_true', help='移除无效文件')

    replace = subparser.add_parser('replace', help='替换xml类别名', description='替换xml类别名')
    replace.add_argument('--old_name', required=True, help='旧类别名')
    replace.add_argument('--new_name', required=True, help='新类别名')

    delobjects = subparser.add_parser('delobjects', help='删除xml类别.可同时删除多个类别,多个类别间用空格隔开', description='删除xml类别')
    # Required: omitting it used to pass None to DelObject.
    delobjects.add_argument('--del_objects', nargs='+', required=True, help='删除的类别名')

    move = subparser.add_parser('move', help='移动xml文件', description='移动xml文件')
    move.add_argument('--to_root', required=True, help='目标文件夹.')

    copy = subparser.add_parser('copy', help='复制xml文件', description='复制xml文件')
    copy.add_argument('--to_root', required=True, help='目标文件夹.')

    args = parse.parse_args()

    # Map each sub-command to the constructor call for its operation.
    op_factories = {
        'invaild': lambda a: IsInvaild(a.remove),
        'replace': lambda a: ReplaceNameOp(a.old_name, a.new_name),
        'move': lambda a: MoveOp(a.to_root),
        'copy': lambda a: CopyOp(a.to_root),
        'delobjects': lambda a: DelObject(a.del_objects),
    }
    op = op_factories[args.subparser_name](args)

    vocparser = VOCParser(args.root, args.recursion, args.num_processes)
    vocparser.register(op)
    vocparser.run()