49 List files
49.1 Methods
Note
Q: Show me ways to recursively list the paths of files and folders under a given directory (e.g., input/) that match a certain pattern (glob, regex).
I’ll show you several ways to recursively list files and folders with pattern matching in Python. Each method has its strengths.
49.1.1 Directory Structure Example
input/
├── data/
│ ├── file1.txt
│ ├── file2.csv
│ └── raw/
│ └── data.json
├── scripts/
│ ├── main.py
│ └── utils.py
└── README.md
49.1.2 Method 1: Using pathlib (Modern & Pythonic)
from pathlib import Path
def list_files_pathlib(directory, pattern):
    """Collect every path below ``directory`` whose name matches ``pattern``.

    Args:
        directory: Root folder to search (string or path-like).
        pattern: Glob pattern passed to ``Path.rglob``, which searches the
            root folder and all of its subdirectories.

    Returns:
        A list of ``Path`` objects (files and directories) that matched.
    """
    root = Path(directory)
    # Unpack the lazy rglob iterator straight into a list.
    return [*root.rglob(pattern)]
# Examples
files = list_files_pathlib('input/', '*.txt')
files = list_files_pathlib('input/', '**/*.py') # All .py files
files = list_files_pathlib('input/', '**/data*') # Files/dirs whose name starts with 'data'
49.1.3 Method 2: Using glob Module
import glob
def _expand_braces(pattern):
    """Expand shell-style ``{a,b}`` alternatives into plain glob patterns.

    ``'*.{txt,csv}'`` becomes ``['*.txt', '*.csv']``. A pattern without a
    comma-separated brace group is returned unchanged as a one-item list.
    """
    import re  # local import: this snippet only imports ``glob`` at the top

    # Match the innermost comma-separated group so that nested or repeated
    # groups are unwound one at a time by the recursion below.
    group = re.search(r'\{([^{}]*,[^{}]*)\}', pattern)
    if group is None:
        return [pattern]
    prefix, suffix = pattern[:group.start()], pattern[group.end():]
    expanded = []
    for option in group.group(1).split(','):
        expanded.extend(_expand_braces(prefix + option + suffix))
    return expanded


def list_files_glob(directory, pattern, recursive=True):
    """List paths under ``directory`` that match a glob pattern.

    ``glob.glob`` has no brace expansion, so a pattern such as
    ``'*.{txt,csv,json}'`` would otherwise only match file names that
    literally contain braces. Brace groups are therefore expanded here and
    each resulting pattern is globbed separately.

    Args:
        directory: Root folder to search.
        pattern: Glob pattern for the file name; may contain ``{a,b}``
            alternatives.
        recursive: When true (default) search all subdirectories via ``**``;
            otherwise only look directly inside ``directory``.

    Returns:
        A list of matching path strings without duplicates, in first-seen
        order.
    """
    matches = []
    for plain in _expand_braces(pattern):
        search_pattern = (
            f"{directory}/**/{plain}" if recursive else f"{directory}/{plain}"
        )
        matches.extend(glob.glob(search_pattern, recursive=recursive))
    # Overlapping alternatives can match the same path twice; dict.fromkeys
    # removes repeats while keeping the original order.
    return list(dict.fromkeys(matches))
# Examples
files = list_files_glob('input/', '*.txt')
files = list_files_glob('input/', '*.{txt,csv,json}') # Multiple extensions (note: glob.glob itself does not expand {a,b} brace groups)
49.1.4 Method 3: Using os.walk() with Regex
import os
import re
def list_files_regex(directory, pattern):
    """Walk ``directory`` and collect entries whose name matches a regex.

    Args:
        directory: Root folder handed to ``os.walk``.
        pattern: Regular expression applied with ``search`` to each bare
            file or directory name (not to the full path).

    Returns:
        A list of joined paths. Within each visited folder, matching files
        come first, followed by matching subdirectory names.
    """
    matcher = re.compile(pattern)
    found = []
    for current, subdirs, filenames in os.walk(directory):
        # Files first, then directory names, mirroring the per-folder order.
        found.extend(
            os.path.join(current, entry)
            for entry in filenames + subdirs
            if matcher.search(entry)
        )
    return found
# Examples
files = list_files_regex('input/', r'\.txt$') # Files ending with .txt
files = list_files_regex('input/', r'^data') # Files/dirs starting with 'data'
files = list_files_regex('input/', r'\.(txt|csv|json)$') # Multiple extensions
49.1.5 Method 4: Using fnmatch for Shell-style Patterns
import os
import fnmatch
def list_files_fnmatch(directory, pattern):
    """Walk ``directory`` and collect files matching a shell-style pattern.

    Args:
        directory: Root folder handed to ``os.walk``.
        pattern: ``fnmatch`` pattern tested against each bare file name.

    Returns:
        A list of joined paths for the matching files; directory names are
        not tested.
    """
    collected = []
    for folder, _subdirs, names in os.walk(directory):
        # fnmatch.filter keeps the names that fnmatch.fnmatch would accept.
        for hit in fnmatch.filter(names, pattern):
            collected.append(os.path.join(folder, hit))
    return collected
# Examples
files = list_files_fnmatch('input/', '*.txt')
files = list_files_fnmatch('input/', 'data*') # Files starting with 'data'
files = list_files_fnmatch('input/', '*[0-9].*') # Files with a digit right before a dot
49.1.6 Pattern Types Comparison
GLOB PATTERNS REGEX PATTERNS
───────────── ───────────────
*.txt \.txt$
file?.txt file.\.txt
file[0-9].txt file[0-9]\.txt
**/*.txt .*\.txt$
data*/file* ^data.*file.*
Performance Tips
- pathlib.rglob() - Most Pythonic, good performance
- glob.glob() - Fast for simple patterns
- os.walk() + regex - Most flexible but slower
- fnmatch - Good for shell-style patterns
Choose based on your needs:
- Simple glob patterns: Use pathlib
- Complex regex matching: Use os.walk() + re
- Shell-style patterns: Use fnmatch
- Maximum flexibility: Combine methods in a class
49.2 List Files with Multiple Extensions
49.2.1 Method 1B: Using Set Union
from pathlib import Path
def list_files_union(directory, extensions):
    """List paths matching any of several extensions, without duplicates.

    Each extension triggers one recursive ``rglob`` scan; results are merged
    in a set so a path found by more than one scan is reported once.

    Args:
        directory: Root folder to search (string or path-like).
        extensions: Iterable of extensions, with or without a leading dot
            (``'pdf'`` and ``'.pdf'`` are equivalent).

    Returns:
        A sorted list of matching ``Path`` objects.
    """
    path = Path(directory)
    files = set()
    for ext in extensions:
        # Drop one leading dot so '.pdf' does not become the pattern '*..pdf'.
        bare = ext[1:] if ext.startswith('.') else ext
        files.update(path.rglob(f'*.{bare}'))
    # Sorting makes the order deterministic; plain set iteration order is not.
    return sorted(files)
# Usage
files = list_files_union('input/', ['docx', 'pdf'])
49.2.2 Method 1D: Filter with Suffix Check
from pathlib import Path
def list_files_suffix(directory, extensions):
    """Scan ``directory`` once and keep files whose suffix is requested.

    Args:
        directory: Root folder to search (string or path-like).
        extensions: Iterable of extensions; a leading dot is added to any
            entry that lacks one. Comparison with the file suffix is
            case-sensitive.

    Returns:
        A list of ``Path`` objects for regular files with a matching suffix.
    """
    wanted = []
    for ext in extensions:
        wanted.append(ext if ext.startswith('.') else f'.{ext}')

    selected = []
    for candidate in Path(directory).rglob('*'):
        # Directories are skipped first; then the suffix decides.
        if not candidate.is_file():
            continue
        if candidate.suffix in wanted:
            selected.append(candidate)
    return selected
# Usage
files = list_files_suffix('input/', ['docx', 'pdf', '.txt'])
49.2.3 One-Liner
from pathlib import Path
# Get all documents (multiple extensions)
docs = [f for f in Path('input/').rglob('*')
if f.suffix in ['.pdf', '.docx', '.txt']]
# Get all code files
code = [f for f in Path('input/').rglob('*')
if f.suffix in ['.py', '.js', '.html', '.css']]
# Case-insensitive matching
images = [f for f in Path('input/').rglob('*')
if f.suffix.lower() in ['.jpg', '.jpeg', '.png', '.gif']]
49.2.4 Combined
from pathlib import Path
from typing import List, Union
def find_files(directory: str,
               extensions: Union[List[str], None] = None,
               pattern: Union[str, None] = None) -> List[Path]:
    """Find paths under ``directory`` by glob pattern or by extension.

    Args:
        directory: Root folder to search recursively.
        extensions: Extensions to keep, with or without a leading dot.
            Matching is case-insensitive on both sides, so ``'PDF'`` finds
            ``report.pdf`` and ``'pdf'`` finds ``REPORT.PDF``. Ignored when
            ``pattern`` is given.
        pattern: Glob pattern passed to ``Path.rglob``. When given it takes
            precedence and every match is returned, directories included.

    Returns:
        A list of ``Path`` objects. Without ``pattern``, only regular files
        are returned; with neither argument, all files under ``directory``.
    """
    path = Path(directory)

    if pattern:
        # Custom pattern wins over the extension filter.
        return list(path.rglob(pattern))

    if extensions:
        # Normalize: add the missing dot and lower-case, because the file
        # suffix is lower-cased before the membership test below.
        exts = {(e if e.startswith('.') else f'.{e}').lower()
                for e in extensions}
        # Single scan, filter by suffix.
        return [f for f in path.rglob('*')
                if f.is_file() and f.suffix.lower() in exts]

    # No filter: return all files.
    return [f for f in path.rglob('*') if f.is_file()]
# Usage
documents = find_files('input/', extensions=['pdf', 'docx'])
configs = find_files('input/', pattern='*.config')