parse directory
Here are the examples of how to parse directory in python. These are taken from open source projects. By voting up you can indicate which examples are most useful and appropriate.
3 Examples 7
def parseDirectory(fname, validatedDigests=None):
    """Read the directory file *fname* and return a parsed directory object.

    The file contents are loaded with readPossiblyGzippedFile().  If the
    text begins with a "[Directory]" header line a ServerDirectory is
    built; otherwise a SignedDirectory is built.

    fname -- path of the (possibly gzipped) directory file.
    validatedDigests -- passed through unchanged to the directory class.

    Raises ConfigError if the file cannot be read or decompressed.
    """
    try:
        s = readPossiblyGzippedFile(fname)
    except (IOError, zlib.error) as e:
        # "as" form is valid on Python 2.6+ and Python 3 alike.
        raise ConfigError("Couldn't decompress %s: %s" % (fname, e))
    if s.startswith("[Directory]\n"):
        tp = ServerDirectory
    else:
        tp = SignedDirectory
    return tp(fname=fname, string=s, validatedDigests=validatedDigests)
def parse_directory(self, in_path, out_path):
    """Convert every regular file in *in_path* into grants/contracts output.

    Files whose name matches ``self.re_contracts`` are passed to
    ``self.parse_file`` with the contracts settings and written to
    ``contracts.out``; all other files are written to ``grants.out``.
    Sub-directories of *in_path* are logged but not processed.

    in_path -- directory containing the input files.
    out_path -- directory for the two output files.  When falsy it
        defaults to ``<abspath of in_path>/out``; it is created if it
        does not exist.
    """
    if not out_path:
        out_path = os.path.join(os.path.abspath(in_path), 'out')
        self.log.info("Out path wasn't set. Setting it to {0}".format(out_path))

    if not os.path.exists(out_path):
        os.mkdir(out_path)
        self.log.info("Out path didn't exist. Creating {0}".format(out_path))

    grants_path = os.path.join(out_path, 'grants.out')
    contracts_path = os.path.join(out_path, 'contracts.out')

    # Context managers guarantee the handles are closed even if parsing
    # one of the input files raises.
    with open(grants_path, 'w') as out_grants, \
            open(contracts_path, 'w') as out_contracts:
        self.log.info("Looking for input files. ")

        for name in os.listdir(in_path):
            file_path = os.path.join(in_path, name)
            self.log.info(" Found {0}".format(file_path))

            if not os.path.isfile(file_path):
                continue

            with open(file_path, 'rb') as source:
                self.log.info(" Converting {0}. ".format(file_path))
                if self.re_contracts.match(name):
                    self.parse_file(source, out_contracts, fpds.FIELDS,
                                    CONTRACT_STRINGS, fpds.CALCULATED_FIELDS)
                else:
                    self.parse_file(source, out_grants, faads.FIELDS,
                                    GRANT_STRINGS, faads.CALCULATED_FIELDS)

    self.log.info("Done with input files.")
def parse_directory(path, **kwargs): logfile = initialize_logfile(kwargs['logdir']) for file in os.listdir(path): print file print path # we don't process the daily digest or front matter. if file.find('FrontMatter') != -1 or file.find('PgD') != -1: continue # Makes text versions for the parser elif file.endswith('.htm'): old_file = os.path.join(path, file) content = open(old_file, 'r').read() # eliminates extra title and leaves expected space at the top content = re.sub(r'', '', content) # need to eliminate particular blank lines, should sill get the tags out if expected line breaks aren't there. extras = ['\n','', '', '\n', '\n', '', '', '
\n', '', '', '
','', ] for tag in extras: content = content.replace(tag, '') new_name = file[:-3] + 'txt' new_path = os.path.join(path, new_name) text_doc = open(new_path, 'w') text_doc = text_doc.write(content) file = new_name os.remove(old_file) if not file.endswith('.txt'): continue if kwargs.get('interactive', False): resp = raw_input("process file %s? (y/n/q) " % file) if resp == 'n': print 'skipping\n' continue elif resp == 'q': sys.exit() abspath = os.path.join(path, file) try: del kwargs['interactive'] except: pass try: del kwargs['logdir'] except: pass parser = CRParser(abspath, **kwargs) do_parse(parser, logfile) return kwargs['outdir']
Parsing every file in a directory in python?
That will parse the xml file test.xml and print the parsed output. However, I have a huge number of these xml files that need parsing in a directory. How can I modify the code so that it goes through every file in the directory and applies this function to it? Thanks!
2 Answers 2
Use `os.listdir()`: it returns a list of all the files in the directory.
import os
import xml.etree.ElementTree as ET

# Parse every entry of the current directory and print the text of each
# <w> element, grouped by the <s> segment that contains it.
listofxml = os.listdir("./")
for name in listofxml:  # "name", not "xml", so the xml package is not shadowed
    tree = ET.parse(name)
    root = tree.getroot()
    for segment in root.iter("s"):
        for word in segment.iter("w"):
            # Words of one segment stay on one line, separated by spaces.
            print(word.text, end=" ")
        # End the segment's line and leave a blank line after it.
        print("\n")
And if not all the files are xml, then you can split and check:
import os
import xml.etree.ElementTree as ET

# Parse only the *.xml entries of the current directory and print the text
# of each <w> element, grouped by the <s> segment that contains it.
listofxml = os.listdir("./")
for name in listofxml:
    # Check the suffix directly; splitting on "." would also accept a file
    # that is literally named "xml".
    if not name.endswith('.xml'):
        continue
    tree = ET.parse(name)
    root = tree.getroot()
    for segment in root.iter("s"):
        for word in segment.iter("w"):
            # Words of one segment stay on one line, separated by spaces.
            print(word.text, end=" ")
        # End the segment's line and leave a blank line after it.
        print("\n")
import xml.etree.ElementTree as ET


def printParsed(filename):
    """Print the text of every <w> element in the XML file *filename*.

    The words of each <s> segment are printed on one line separated by
    spaces, and each segment is followed by a blank line.
    """
    tree = ET.parse(filename)
    root = tree.getroot()
    for segment in root.iter("s"):
        for word in segment.iter("w"):
            print(word.text, end=" ")
        print("\n")


if __name__ == "__main__":
    from os import listdir
    from os.path import isfile, join

    mypath = 'path/to/your/xml/files'
    onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
    for f in onlyfiles:
        # only does stuff if the file ends in .xml
        if f.endswith('.xml'):
            # listdir() yields bare names, so re-attach the directory.
            printParsed(join(mypath, f))
You would save the file, say as parser.py , and then run it like python parser.py . You can also drop the if __name__ == "__main__" section if you want.
When I run this, I get the following: Traceback (most recent call last): File "xmlparse.py", line 20, in <module> printParsed(f) File "xmlparse.py", line 6, in printParsed tree = ET.parse(name) NameError: global name 'name' is not defined
I removed the `if f[-3:] = '.xml':` line because all the files are XML anyway.
How to parse a directory structure into dictionary?
This structure seems to be quite inconsistent: If there is no subdirectory, the value is `None`. If there is one subdirectory, the value is a dictionary. If there is more than one subdirectory, the value is a list. And all occurring dictionaries only have a single item. A nested dictionary without any lists or `None` values would seem far more appropriate.
3 Answers 3
lst = ['/a/b', '/a/b/c', '/a/b/c/d', '/a/b/c/e',
       '/a/b/c/f/g', '/a/b/c/f/h', '/a/b/c/f/i']

# Build a nested dictionary in which every path component is a key.
# Because each path starts with '/', split('/') yields an empty first
# component, so the outermost key of the result is ''.
dct = {}
for item in lst:
    p = dct
    for x in item.split('/'):
        # Descend into the child dict, creating it on first sight.
        p = p.setdefault(x, {})

print(dct)
this is not exactly your structure, but should give you a basic idea.
As Sven Marnach said, the output data structure should be more consistent, eg only nested dictionaries where folders are associated to dict and files to None.
Here is a script which uses os.walk. It does not take a list as input but should do what you want in the end if you want to parse files.
import os
from pprint import pprint


def set_leaf(tree, branches, leaf):
    """Set a terminal element to *leaf* within nested dictionaries.

    *branches* defines the path through the dictionaries; missing
    intermediate dictionaries are created on the way down.

    Example:

    >>> t = {}
    >>> set_leaf(t, ['b1', 'b2', 'b3'], 'new_leaf')
    >>> t
    {'b1': {'b2': {'b3': 'new_leaf'}}}
    """
    if len(branches) == 1:
        tree[branches[0]] = leaf
        return
    # dict.has_key() no longer exists in Python 3; "in" works everywhere.
    if branches[0] not in tree:
        tree[branches[0]] = {}
    set_leaf(tree[branches[0]], branches[1:], leaf)


if __name__ == '__main__':
    startpath = '.'
    tree = {}
    for root, dirs, files in os.walk(startpath):
        branches = [startpath]
        if root != startpath:
            # Split on the platform separator rather than a literal '/'.
            branches.extend(os.path.relpath(root, startpath).split(os.sep))
        # Directories start as empty dicts (replaced when os.walk reaches
        # them); files map to None.
        set_leaf(tree, branches,
                 dict([(d, {}) for d in dirs] + [(f, None) for f in files]))

    print('tree:')
    pprint(tree)
Parsing all XML files in directory and all subdirectories
I am new to Python, yet I have some experience with Delphi. I am trying to make a script that would be able to search all xml files in directory (including all subdirectories in that directory), then parse those XML and save some data (numbers) from there to a simple txt file. After that I work through that txt file to create another txt file with only unique set of numbers from previously created txt file. I created this script:
import os
from xml.dom import minidom

# for testing purposes
directory = os.getcwd()

print("Procházím aktuální adresář, hledám XML soubory. ")
print("Procházím XML soubory, hledám IČP provádějícího. ")

# Pass 1: walk the tree and write the "icp" attribute of the first <is>
# element of every XML file to ICP_all.txt, one value per line.
with open('ICP_all.txt', 'w') as SeznamICP_all:
    for root, dirs, files in os.walk(directory):
        for filename in files:
            if filename.endswith('.xml'):
                # os.walk() yields bare file names; join with the directory
                # being visited so files in sub-directories are found.
                xmldoc = minidom.parse(os.path.join(root, filename))
                itemlist = xmldoc.getElementsByTagName('is')
                SeznamICP_all.write(itemlist[0].attributes['icp'].value + '\n')

print("Vytvářím list unikátních IČP. ")

# Pass 2: copy each distinct line to ICP_distinct.txt, keeping first-seen
# order. The set makes the membership test O(1) per line.
with open('ICP_distinct.txt', 'w') as distinct:
    UnikatniICP = []
    seen = set()
    with open('ICP_all.txt', 'r') as SeznamICP_all:
        for line in SeznamICP_all:
            if line not in seen:
                seen.add(line)
                UnikatniICP.append(line)
                distinct.write(line)

print('Počet unikátních IČP:' + str(len(UnikatniICP)))
input('Pro ukončení stiskni libovolnou klávesu. ')
FileNotFoundError: [Errno 2] No such file or directory: 'RNN38987.xml'
That is caused by the fact that the file is in a subdirectory, not in the directory with the Python script. I tried to make it work by using `Path` to get the absolute path of the file, but I am getting another error; see the script:
import os
from xml.dom import minidom
from pathlib import Path

# for testing purposes
directory = os.getcwd()

print("Procházím aktuální adresář, hledám XML soubory. ")
print("Procházím XML soubory, hledám IČP provádějícího. ")

# Pass 1: walk the tree and write the "icp" attribute of the first <is>
# element of every XML file to ICP_all.txt, one value per line.
with open('ICP_all.txt', 'w') as SeznamICP_all:
    for root, dirs, files in os.walk(directory):
        for filename in files:
            if filename.endswith('.xml'):
                # Build the path from the directory being visited;
                # Path(filename).resolve() would resolve against the current
                # working directory and miss files in sub-directories.
                soubor = Path(root) / filename
                print(soubor)
                # minidom.parse() treats any non-str argument as an open
                # file object, so hand it a plain string.
                xmldoc = minidom.parse(str(soubor))
                itemlist = xmldoc.getElementsByTagName('is')
                SeznamICP_all.write(itemlist[0].attributes['icp'].value + '\n')

print("Vytvářím list unikátních IČP. ")

# Pass 2: copy each distinct line to ICP_distinct.txt, keeping first-seen
# order. The set makes the membership test O(1) per line.
with open('ICP_distinct.txt', 'w') as distinct:
    UnikatniICP = []
    seen = set()
    with open('ICP_all.txt', 'r') as SeznamICP_all:
        for line in SeznamICP_all:
            if line not in seen:
                seen.add(line)
                UnikatniICP.append(line)
                distinct.write(line)

print('Počet unikátních IČP:' + str(len(UnikatniICP)))
input('Pro ukončení stiskni libovolnou klávesu. ')
Procházím aktuální adresář, hledám XML soubory. Procházím XML soubory, hledám IČP provádějícího. C:\2_Programming\Python\IČP FINDER\src\20150225_1815_2561_1.xml Traceback (most recent call last): File "C:\2_Programming\Python\IČP FINDER\src\ICP Finder.py", line 17, in xmldoc = minidom.parse(soubor) File "C:\2_Programming\Python\Interpreter\lib\xml\dom\minidom.py", line 1958, in parse return expatbuilder.parse(file) File "C:\2_Programming\Python\Interpreter\lib\xml\dom\expatbuilder.py", line 913, in parse result = builder.parseFile(file) File "C:\2_Programming\Python\Interpreter\lib\xml\dom\expatbuilder.py", line 204, in parseFile buffer = file.read(16*1024) AttributeError: 'WindowsPath' object has no attribute 'read'