49 List files
49.1 Methods
Note
Q: Show me ways to recursively list the paths of files and folders under a given directory (e.g., input/) that match a certain pattern (glob, regex).
I’ll show you several ways to recursively list files and folders with pattern matching in Python. Each method has its strengths.
49.1.1 Directory Structure Example
input/
├── data/
│ ├── file1.txt
│ ├── file2.csv
│ └── raw/
│ └── data.json
├── scripts/
│ ├── main.py
│ └── utils.py
└── README.md
49.1.2 Method 1: Using pathlib (Modern & Pythonic)
from pathlib import Path
def list_files_pathlib(directory, pattern):
    """Recursively list paths under ``directory`` that match a glob pattern.

    Args:
        directory: Root directory to search (string or path-like).
        pattern: Glob pattern passed to ``Path.rglob``, e.g. ``'*.txt'``.
            It is matched at every depth below ``directory``.

    Returns:
        A list of ``Path`` objects. Both files and directories are included
        when their names match; the list is empty when nothing matches.
    """
    path = Path(directory)
    return list(path.rglob(pattern))
# Examples
files = list_files_pathlib('input/', '*.txt')   # All .txt files
# rglob() already searches every sub-directory, so no '**/' prefix is needed.
files = list_files_pathlib('input/', '*.py')    # All .py files
files = list_files_pathlib('input/', 'data*')   # Files/dirs whose name starts with 'data'
49.1.3 Method 2: Using glob Module
import glob
def list_files_glob(directory, pattern, recursive=True):
    """List paths under ``directory`` that match a glob pattern.

    Args:
        directory: Root directory to search. A trailing separator is fine.
        pattern: Glob pattern for the final path component, e.g. ``'*.txt'``.
            Brace lists such as ``'*.{txt,csv}'`` are not expanded by
            ``glob``; call this function once per extension instead.
        recursive: When true (the default) search ``directory`` and every
            sub-directory; when false search ``directory`` only.

    Returns:
        A list of matching path strings, as returned by ``glob.glob``.
    """
    import os  # local import keeps this snippet self-contained

    # os.path.join avoids the doubled separator that plain string formatting
    # produces when ``directory`` already ends with one (e.g. 'input/').
    if recursive:
        search_pattern = os.path.join(directory, '**', pattern)
    else:
        search_pattern = os.path.join(directory, pattern)
    return glob.glob(search_pattern, recursive=recursive)
# Examples
files = list_files_glob('input/', '*.txt')
# Multiple extensions: Python's glob does not expand braces, so a pattern like
# '*.{txt,csv,json}' matches nothing useful. Query each extension in turn.
files = [match
         for ext in ('txt', 'csv', 'json')
         for match in list_files_glob('input/', f'*.{ext}')]
49.1.4 Method 3: Using os.walk() with Regex
import os
import re
def list_files_regex(directory, pattern):
    """Recursively collect paths whose base name matches a regular expression.

    Args:
        directory: Root directory handed to ``os.walk``.
        pattern: Regular expression. It is applied with ``search`` to each
            file or directory *name* (not the full path), so anchor it with
            ``^`` / ``$`` when a whole-name match is required.

    Returns:
        A list of ``os.path.join(root, name)`` strings. Within each walked
        directory, matching files come before matching sub-directories.
    """
    regex = re.compile(pattern)
    matches = []

    for root, dirs, files in os.walk(directory):
        # Check file names first, then directory names.
        matches.extend(
            os.path.join(root, name)
            for name in (*files, *dirs)
            if regex.search(name)
        )

    return matches
# Examples
files = list_files_regex('input/', r'\.txt$')             # Files ending with .txt
files = list_files_regex('input/', r'^data')              # Files/dirs starting with 'data'
files = list_files_regex('input/', r'\.(txt|csv|json)$')  # Multiple extensions
49.1.5 Method 4: Using fnmatch for Shell-style Patterns
import os
import fnmatch
def list_files_fnmatch(directory, pattern):
    """Recursively list files whose name matches a shell-style pattern.

    Args:
        directory: Root directory handed to ``os.walk``.
        pattern: Shell-style wildcard pattern (``*``, ``?``, ``[seq]``)
            matched against each file name, not the full path.

    Returns:
        A list of ``os.path.join(root, filename)`` strings. Directory names
        are never included.
    """
    matches = []

    for root, _dirs, files in os.walk(directory):
        # fnmatch.filter applies the same matching as fnmatch.fnmatch to the
        # whole list of names in one call.
        matches.extend(
            os.path.join(root, filename)
            for filename in fnmatch.filter(files, pattern)
        )

    return matches
# Examples
files = list_files_fnmatch('input/', '*.txt')
files = list_files_fnmatch('input/', 'data*')     # Files starting with 'data'
files = list_files_fnmatch('input/', '*[0-9].*')  # Files with numbers
49.1.6 Pattern Types Comparison
GLOB PATTERNS REGEX PATTERNS
───────────── ───────────────
*.txt \.txt$
file?.txt file.\.txt
file[0-9].txt file[0-9]\.txt
**/*.txt .*\.txt$
data*/file* ^data[^/]*/file[^/]*$
Performance Tips
- pathlib.rglob() - Most Pythonic, good performance
- glob.glob() - Fast for simple patterns
- os.walk() + regex - Most flexible but slower
- fnmatch - Good for shell-style patterns
Choose based on your needs:
- Simple glob patterns: Use pathlib
- Complex regex matching: Use os.walk() + re
- Shell-style patterns: Use fnmatch
- Maximum flexibility: Combine methods in a class
49.2 List files Multiple Extension
49.2.1 Method 1B: Using Set Union
from pathlib import Path
def list_files_union(directory, extensions):
    """Recursively list paths matching any of several extensions.

    One ``rglob`` scan is run per extension and the results are merged in a
    set, so a path is reported once even if extensions are repeated.

    Args:
        directory: Root directory to search.
        extensions: Iterable of extensions, with or without the leading dot
            (``'pdf'`` and ``'.pdf'`` are equivalent).

    Returns:
        A sorted list of ``Path`` objects whose names match ``*.<ext>`` for
        at least one requested extension.
    """
    path = Path(directory)
    files = set()

    for ext in extensions:
        # Strip any leading dot so '.pdf' does not become the pattern '*..pdf'.
        files.update(path.rglob(f"*.{ext.lstrip('.')}"))

    # Sort so callers get a deterministic order instead of set order.
    return sorted(files)
# Usage
files = list_files_union('input/', ['docx', 'pdf'])
49.2.2 Method 1D: Filter with Suffix Check
from pathlib import Path
def list_files_suffix(directory, extensions):
    """Recursively list files whose suffix is one of ``extensions``.

    The tree is scanned once and every entry is filtered by ``Path.suffix``.

    Args:
        directory: Root directory to search.
        extensions: Iterable of extensions, with or without the leading dot.
            Matching is case-sensitive.

    Returns:
        A list of ``Path`` objects for regular files only.
    """
    path = Path(directory)
    # Normalise to dotted form once; a set gives constant-time membership tests.
    wanted = {ext if ext.startswith('.') else f'.{ext}' for ext in extensions}
    return [f for f in path.rglob('*')
            if f.is_file() and f.suffix in wanted]
# Usage
files = list_files_suffix('input/', ['docx', 'pdf', '.txt'])
49.2.3 One-Liner
from pathlib import Path
# Get all documents (multiple extensions)
docs = [f for f in Path('input/').rglob('*')
        if f.suffix in ['.pdf', '.docx', '.txt']]

# Get all code files
code = [f for f in Path('input/').rglob('*')
        if f.suffix in ['.py', '.js', '.html', '.css']]

# Case-insensitive matching: lower-case the suffix before comparing
images = [f for f in Path('input/').rglob('*')
          if f.suffix.lower() in ['.jpg', '.jpeg', '.png', '.gif']]
49.2.4 Combined
from pathlib import Path
from typing import List, Union
def find_files(directory: str,
               extensions: Union[List[str], None] = None,
               pattern: Union[str, None] = None) -> List[Path]:
    """Flexible recursive file finder built on pathlib.

    Exactly one search mode is used, chosen in this order:

    1. ``pattern`` given: return everything ``Path.rglob(pattern)`` yields
       (files and directories); ``extensions`` is ignored.
    2. ``extensions`` given: return regular files whose suffix matches one of
       them, ignoring case.
    3. Neither given: return every regular file under ``directory``.

    Args:
        directory: Root directory to search.
        extensions: Extensions with or without the leading dot, in any case
            (``'pdf'``, ``'.PDF'``).
        pattern: Glob pattern for ``Path.rglob``, e.g. ``'*.config'``.

    Returns:
        A list of matching ``Path`` objects.
    """
    path = Path(directory)

    if pattern:
        # Use custom pattern
        return list(path.rglob(pattern))

    if extensions:
        # Normalise extensions: add the dot if missing, and lower-case them,
        # because they are compared against ``f.suffix.lower()`` below.
        exts = {(e if e.startswith('.') else f'.{e}').lower()
                for e in extensions}
        # Single scan, filter by suffix
        return [f for f in path.rglob('*')
                if f.is_file() and f.suffix.lower() in exts]

    # Return all files
    return [f for f in path.rglob('*') if f.is_file()]
# Usage
documents = find_files('input/', extensions=['pdf', 'docx'])
configs = find_files('input/', pattern='*.config')