voc格式数据集xml文件操作

1. 介绍

包括:移动、复制、检查无效(无法解析)或无目标的xml文件、替换类别名、删除指定类别的目标等。可通过继承BaseOp类并实现run方法,轻松扩展其他功能。

2. 例子

# 例子-复制, 使用5个进程, 将/home/my_data下所有的xml文件(包括子文件夹下的)复制到/home/my_data_new目录下
python VocXMLOps.py --root /home/my_data --recursion --num_processes 5 copy --to_root /home/my_data_new/

# 例子-替换类别名, 将/home/my_data当前目录下(不包括子文件夹)的xml文件中的'Person'类别名替换为'人'
python VocXMLOps.py --root /home/my_data replace --old_name Person --new_name 人

3. 代码

import xml.etree.ElementTree as ET
import os
import shutil
from multiprocessing import Pool


class BaseOp:
    """Base class for per-file operations.

    Subclasses override :meth:`run`, which is handed the path of one file.
    """

    def __init__(self):
        """Nothing to initialise; exists so subclasses can chain to it."""

    def run(self, file_path):
        """Process one file.

        :param file_path: path of the file to process.
        :raises NotImplementedError: always, the base class has no behaviour.
        """
        raise NotImplementedError()


class IsInvaild(BaseOp):
    """Find xml files that cannot be parsed or that contain no ``object`` element.

    Each offending file is reported on stdout and, when ``remove`` is set,
    deleted from disk.
    """

    def __init__(self, remove=False):
        """
        :param remove: when true, delete every offending file instead of only
                       reporting it.
        """
        super(IsInvaild, self).__init__()
        self.remove = remove

    def run(self, file_path):
        """Check one xml file and report/remove it if it is unusable.

        :param file_path: path of the xml file to check.
        """
        try:
            tree = ET.parse(file_path)
        except Exception:
            # Only the parse step is guarded here: any failure to read or
            # parse the file is what "invalid" means for this operation.
            self._report('invaild', file_path)
            return

        # Only direct children of the root named "object" count as targets.
        if not tree.findall('object'):
            self._report('no object', file_path)

    def _report(self, kind, file_path):
        """Print the finding for *file_path* and delete the file if requested.

        :param kind: label inserted in the message ('invaild' or 'no object').
        :param file_path: path of the offending file.
        """
        if not self.remove:
            print('Found {} xml file : {}'.format(kind, file_path))
            return

        try:
            os.remove(file_path)
        except OSError:
            # A failed deletion is reported under the file's real category
            # rather than being mistaken for a parse failure.
            print('Found {} xml file : {}, but remove failed.'.format(kind, file_path))
        else:
            print('Found and remove {} xml file : {}'.format(kind, file_path))


class DelObject(BaseOp):
    """Delete every ``object`` element whose ``name`` is in a given list."""

    def __init__(self, del_object_names:list):
        """
        :param del_object_names: category names whose objects are removed.
        """
        super(DelObject, self).__init__()
        self.del_object_names = del_object_names

    def run(self, file_path):
        """Remove the matching objects from one xml file and rewrite it in place.

        Failures are printed instead of being silently dropped; they never
        propagate to the caller.

        :param file_path: path of the xml file to edit.
        """
        # Tolerate a missing list (None) by treating it as "nothing to delete".
        names = self.del_object_names or ()
        try:
            tree = ET.parse(file_path)
            root = tree.getroot()
            # findall returns a list, so removing while looping is safe.
            for obj in root.findall('object'):
                # findtext yields None when the object has no <name> child,
                # so a malformed object is skipped instead of aborting the file.
                if obj.findtext('name') in names:
                    root.remove(obj)

            # The default output encoding is US-ASCII, which turns non-ASCII
            # text into numeric character references; write UTF-8 instead.
            tree.write(file_path, encoding='utf-8')
        except Exception as err:
            print("Del objs failed for {}: {!r}".format(file_path, err))
            return

        print("Del objs: {} from {}".format(self.del_object_names, file_path))


class MoveOp(BaseOp):
    """Move files into a single destination directory."""

    def __init__(self, to_root):
        """
        :param to_root: destination directory.
        """
        super(MoveOp, self).__init__()
        self.to_root = to_root

    def run(self, file_path):
        """Move one file into ``to_root``, keeping only its base name.

        NOTE: because only the base name is kept, two source files with the
        same name in different folders map to the same destination path.

        :param file_path: path of the file to move.
        """
        to_path = os.path.join(self.to_root, os.path.basename(file_path))
        try:
            # Create the destination on demand; exist_ok tolerates another
            # worker having created it first.
            os.makedirs(self.to_root, exist_ok=True)
            shutil.move(file_path, to_path)
        except OSError as err:
            # shutil.Error derives from OSError, so it is reported here too.
            print("Move {} to {} failed: {!r}".format(file_path, to_path, err))
            return

        print("Move {} to {}".format(file_path, to_path))


class CopyOp(BaseOp):
    """Copy files into a single destination directory."""

    def __init__(self, to_root):
        """
        :param to_root: destination directory.
        """
        super(CopyOp, self).__init__()
        self.to_root = to_root

    def run(self, file_path):
        """Copy one file into ``to_root``, keeping only its base name.

        NOTE: because only the base name is kept, two source files with the
        same name in different folders map to the same destination path.

        :param file_path: path of the file to copy.
        """
        to_path = os.path.join(self.to_root, os.path.basename(file_path))
        try:
            # Create the destination on demand; exist_ok tolerates another
            # worker having created it first.
            os.makedirs(self.to_root, exist_ok=True)
            shutil.copy(file_path, to_path)
        except OSError as err:
            # shutil.SameFileError derives from OSError, so it lands here too.
            print("Copy {} to {} failed: {!r}".format(file_path, to_path, err))
            return

        print("Copy {} to {}".format(file_path, to_path))


class ReplaceNameOp(BaseOp):
    """Rename one object category inside xml annotation files."""

    def __init__(self, old_name, new_name):
        """
        :param old_name: category name to look for.
        :param new_name: category name written in its place.
        """
        super(ReplaceNameOp, self).__init__()
        self.old_name = old_name
        self.new_name = new_name

    def run(self, file_path):
        """Replace ``old_name`` with ``new_name`` in one xml file, in place.

        Failures are printed instead of being silently dropped; they never
        propagate to the caller.

        :param file_path: path of the xml file to edit.
        """
        try:
            tree = ET.parse(file_path)
            for obj in tree.findall('object'):
                name_node = obj.find('name')
                # An object without a <name> child is skipped rather than
                # aborting the whole file.
                if name_node is not None and name_node.text == self.old_name:
                    name_node.text = self.new_name

            # The default output encoding is US-ASCII, which turns non-ASCII
            # names into numeric character references; write UTF-8 instead.
            tree.write(file_path, encoding='utf-8')
        except Exception as err:
            print("Replace failed for {}: {!r}".format(file_path, err))
            return

        print("Replace {}".format(file_path))


class VOCParser(object):
    """Collect xml files under a directory and apply registered ops to them
    through a multiprocessing pool."""

    def __init__(self, root:str, recursion=False, num_processes=10):
        """
        :param root:          directory to scan for xml files.
        :param recursion:     whether sub-directories are scanned as well.
        :param num_processes: number of worker processes in the pool.
        """
        self.root = root
        self.ops = []
        self.xmls = []
        # Not populated anywhere in this class; kept because it is a public
        # attribute of existing instances.
        self.jpgs = []

        # Scan first: if the scan raises (e.g. missing root) no worker
        # processes have been started yet, so none are leaked.
        self.find_xml(self.root, recursion)

        self.pool = Pool(processes=num_processes)

    def register(self, op: "BaseOp"):
        """Add an operation to apply to every collected xml file.

        :param op: object whose ``run(file_path)`` method is called per file.
        """
        self.ops.append(op)

    def find_xml(self, root, recursion):
        """Append the path of every ``.xml`` file under *root* to ``self.xmls``.

        :param root:      directory to list.
        :param recursion: descend into sub-directories when true.
        """
        for entry in os.listdir(root):
            path = os.path.join(root, entry)
            if os.path.isdir(path):
                if recursion:
                    self.find_xml(path, recursion)
            elif os.path.isfile(path) and path.endswith('.xml'):
                self.xmls.append(path)

    def run(self):
        """Submit every (file, op) pair to the pool and wait for completion.

        Each pair is an independent task, so several ops registered for the
        same file are not guaranteed to run in registration order. The pool
        is closed afterwards, so ``run`` can be called only once per instance.
        """
        try:
            for xml in self.xmls:
                for op in self.ops:
                    # Results are not collected, so without an error callback
                    # an exception raised in a worker would vanish silently.
                    self.pool.apply_async(
                        op.run, (xml,),
                        error_callback=self._make_error_reporter(op, xml))
        finally:
            # Always release the workers, even if submission failed.
            self.pool.close()
            self.pool.join()

    @staticmethod
    def _make_error_reporter(op, xml):
        """Return a callback that prints a worker failure for *op* on *xml*."""
        def report(err):
            print('{} failed on {}: {!r}'.format(type(op).__name__, xml, err))
        return report


if __name__ == '__main__':
    # Command-line entry point: global options select the files, the
    # sub-command selects the single operation applied to each of them.
    import argparse

    parse = argparse.ArgumentParser(description='VOC格式数据集,xml文件操作.')
    parse.add_argument('--root', required=True, help='文件根目录.')
    # Help text fixed: the flag enables recursion, it does not negate it.
    parse.add_argument('--recursion', action='store_true', help='是否递归处理所有子文件夹中的xml文件.')
    parse.add_argument('--num_processes', default=10, type=int, help='进程数.')

    subparser = parse.add_subparsers(dest='subparser_name', description='子命令')
    # Without a sub-command nothing would be registered and the run would
    # silently do nothing; make argparse reject that invocation instead.
    subparser.required = True

    invaild = subparser.add_parser('invaild', help='查找无效或无目标的xml文件', description='查找无效或无目标的xml文件')
    invaild.add_argument('--remove', action='store_true', help='移除无效文件')

    replace = subparser.add_parser('replace', help='替换xml类别名', description='替换xml类别名')
    replace.add_argument('--old_name', required=True, help='旧类别名')
    replace.add_argument('--new_name', required=True, help='新类别名')

    delobjects = subparser.add_parser('delobjects', help='删除xml类别.可同时删除多个类别,多个类别间用空格隔开', description='删除xml类别')
    # Required: omitting it would hand None to DelObject.
    delobjects.add_argument('--del_objects', nargs='+', required=True, help='删除的类别名')

    move = subparser.add_parser('move', help='移动xml文件', description='移动xml文件')
    move.add_argument('--to_root', required=True, help='目标文件夹.')

    copy = subparser.add_parser('copy', help='复制xml文件', description='复制xml文件')
    copy.add_argument('--to_root', required=True, help='目标文件夹.')

    args = parse.parse_args()

    # Lazily-built ops keyed by sub-command name; only the chosen one reads
    # its own arguments from ``args``.
    op_factories = {
        'invaild': lambda: IsInvaild(args.remove),
        'replace': lambda: ReplaceNameOp(args.old_name, args.new_name),
        'move': lambda: MoveOp(args.to_root),
        'copy': lambda: CopyOp(args.to_root),
        'delobjects': lambda: DelObject(args.del_objects),
    }

    vocparser = VOCParser(args.root, args.recursion, args.num_processes)
    vocparser.register(op_factories[args.subparser_name]())
    vocparser.run()