49  List files

49.1 Methods

Note

Q: Show me ways to recursively list the paths of files and folders under a given directory (e.g., input/) that match a certain pattern (glob, regex).

I’ll show you several ways to recursively list files and folders with pattern matching in Python. Each method has its strengths.

49.1.1 Directory Structure Example

input/
├── data/
│   ├── file1.txt
│   ├── file2.csv
│   └── raw/
│       └── data.json
├── scripts/
│   ├── main.py
│   └── utils.py
└── README.md

49.1.2 Method 1: Using pathlib (Modern & Pythonic)

from pathlib import Path

def list_files_pathlib(directory, pattern):
    """Collect every path below *directory* whose name matches *pattern*.

    The search is delegated to ``Path.rglob``, which looks in *directory*
    itself and in all of its subdirectories.

    Args:
        directory: Root of the search (string or path-like).
        pattern: Glob pattern, e.g. ``'*.txt'``.

    Returns:
        A list of ``Path`` objects, one per match, in the order that
        ``rglob`` produced them.
    """
    matches = Path(directory).rglob(pattern)
    return [*matches]

# Examples (rglob already searches every subdirectory, so the leading '**/'
# used below is optional)
files = list_files_pathlib('input/', '*.txt')  # All .txt files
files = list_files_pathlib('input/', '**/*.py')  # All .py files
files = list_files_pathlib('input/', '**/data*')  # Files/dirs whose name starts with 'data'

49.1.3 Method 2: Using glob Module

import glob

def _expand_braces(pattern):
    """Expand shell-style ``{a,b}`` alternatives into plain glob patterns.

    Only groups that contain a comma are expanded; the innermost group is
    handled first, so nested groups work too. A pattern without such a
    group is returned unchanged as a one-element list.
    """
    import re  # local import: the surrounding snippet only imports ``glob``

    group = re.search(r"\{([^{}]*,[^{}]*)\}", pattern)
    if group is None:
        return [pattern]

    prefix, suffix = pattern[:group.start()], pattern[group.end():]
    expanded = []
    for option in group.group(1).split(","):
        expanded.extend(_expand_braces(prefix + option + suffix))
    return expanded


def list_files_glob(directory, pattern, recursive=True):
    """List paths under *directory* matching a glob pattern.

    Python's ``glob`` module has no brace expansion, so a pattern such as
    ``'*.{txt,csv,json}'`` would otherwise match nothing. Comma-separated
    brace groups in *pattern* are therefore expanded here and each
    alternative is globbed separately. The literal pattern is searched as
    well, so names that really contain braces still match.

    Args:
        directory: Root directory of the search.
        pattern: Glob pattern applied to names below *directory*; may
            contain ``{a,b}`` alternatives.
        recursive: When true (default), search all subdirectories via
            ``**``; otherwise only *directory* itself.

    Returns:
        A list of matching path strings. When brace alternatives are
        expanded, duplicates are dropped while keeping first-seen order.
    """

    def search(candidate):
        # Same pattern construction as before, applied per alternative.
        search_pattern = (
            f"{directory}/**/{candidate}" if recursive
            else f"{directory}/{candidate}"
        )
        return glob.glob(search_pattern, recursive=recursive)

    alternatives = _expand_braces(pattern)
    if alternatives == [pattern]:
        # Nothing to expand: identical to a single plain glob call.
        return search(pattern)

    matches = []
    for candidate in dict.fromkeys([pattern, *alternatives]):
        matches.extend(search(candidate))
    # Overlapping alternatives can hit the same path more than once.
    return list(dict.fromkeys(matches))

# Examples
files = list_files_glob('input/', '*.txt')
files = list_files_glob('input/', '*.{txt,csv,json}')  # Multiple extensions

49.1.4 Method 3: Using os.walk() with Regex

import os
import re

def list_files_regex(directory, pattern):
    """Walk *directory* and collect entries whose name matches a regex.

    The expression is applied with ``search`` to the bare name (not the
    full path) of every file and every subdirectory found by ``os.walk``.

    Args:
        directory: Root directory to walk.
        pattern: Regular expression source, e.g. ``r'\\.txt$'``.

    Returns:
        A list of joined paths. Within each visited directory, matching
        files come before matching subdirectories.
    """
    compiled = re.compile(pattern)
    found = []

    for current_dir, subdirs, filenames in os.walk(directory):
        # Files first, then directory names, to keep the per-folder order.
        found.extend(
            os.path.join(current_dir, name)
            for name in filenames + subdirs
            if compiled.search(name)
        )

    return found

# Examples
files = list_files_regex('input/', r'\.txt$')  # Files ending with .txt
files = list_files_regex('input/', r'^data')   # Files/dirs starting with 'data'
files = list_files_regex('input/', r'\.(txt|csv|json)$')  # Multiple extensions

49.1.5 Method 4: Using fnmatch for Shell-style Patterns

import os
import fnmatch

def list_files_fnmatch(directory, pattern):
    """Walk *directory* and collect files whose name fits a shell pattern.

    Only file names are tested (directories are never returned), and the
    pattern is matched against the bare name rather than the full path.

    Args:
        directory: Root directory to walk.
        pattern: Shell-style wildcard pattern, e.g. ``'*.txt'``.

    Returns:
        A list of joined path strings for every matching file.
    """
    found = []

    for current_dir, _subdirs, filenames in os.walk(directory):
        found.extend(
            os.path.join(current_dir, name)
            for name in fnmatch.filter(filenames, pattern)
        )

    return found

# Examples
files = list_files_fnmatch('input/', '*.txt')  # Files ending with .txt
files = list_files_fnmatch('input/', 'data*')  # Files starting with 'data'
files = list_files_fnmatch('input/', '*[0-9].*')  # Files with a digit right before a '.', e.g. file1.txt

49.1.6 Pattern Types Comparison

GLOB PATTERNS           REGEX PATTERNS
─────────────          ───────────────
*.txt                  \.txt$
file?.txt              file.\.txt
file[0-9].txt          file[0-9]\.txt
**/*.txt               .*\.txt$
data*/file*            ^data[^/]*/file[^/]*$

Performance Tips

  1. pathlib.rglob() - Most Pythonic, good performance
  2. glob.glob() - Fast for simple patterns
  3. os.walk() + regex - Most flexible but slower
  4. fnmatch - Good for shell-style patterns

Choose based on your needs:

  - Simple glob patterns: Use pathlib
  - Complex regex matching: Use os.walk() + re
  - Shell-style patterns: Use fnmatch
  - Maximum flexibility: Combine methods in a class

49.2 List Files with Multiple Extensions

49.2.1 Method 1B: Using Set Union

from pathlib import Path

def list_files_union(directory, extensions):
    """List paths matching any of several extensions, without duplicates.

    One recursive glob is run per extension and the results are merged in
    a set, so a path is reported once even if extensions are repeated.

    Args:
        directory: Root directory to search.
        extensions: Iterable of extensions, with or without a leading dot
            (``'pdf'`` and ``'.pdf'`` are equivalent).

    Returns:
        A sorted list of ``Path`` objects, so the output order is
        deterministic.
    """
    root = Path(directory)
    found = set()

    for ext in extensions:
        # Strip any leading dot; otherwise '.pdf' would yield the pattern
        # '*..pdf', which matches nothing.
        found.update(root.rglob(f"*.{ext.lstrip('.')}"))

    return sorted(found)

# Usage
files = list_files_union('input/', ['docx', 'pdf'])

49.2.2 Method 1D: Filter with Suffix Check

from pathlib import Path

def list_files_suffix(directory, extensions):
    """Scan *directory* recursively and keep files with a wanted suffix.

    Args:
        directory: Root directory to search.
        extensions: Extensions to keep; a leading dot is added to any
            entry that lacks one. Matching is case-sensitive.

    Returns:
        A list of ``Path`` objects for regular files whose ``suffix`` is
        one of the requested extensions.
    """
    root = Path(directory)

    wanted = set()
    for ext in extensions:
        wanted.add(ext if ext.startswith('.') else f'.{ext}')

    selected = []
    for candidate in root.rglob('*'):
        if candidate.is_file() and candidate.suffix in wanted:
            selected.append(candidate)
    return selected

# Usage
files = list_files_suffix('input/', ['docx', 'pdf', '.txt'])

49.2.3 One-Liner

from pathlib import Path

# Each one-liner scans input/ recursively and keeps regular files only, so
# a directory whose name happens to end in a listed suffix is not reported.

# Get all documents (multiple extensions)
docs = [f for f in Path('input/').rglob('*')
        if f.is_file() and f.suffix in {'.pdf', '.docx', '.txt'}]

# Get all code files
code = [f for f in Path('input/').rglob('*')
        if f.is_file() and f.suffix in {'.py', '.js', '.html', '.css'}]

# Case-insensitive matching (suffix is lowercased before the lookup)
images = [f for f in Path('input/').rglob('*')
          if f.is_file() and f.suffix.lower() in {'.jpg', '.jpeg', '.png', '.gif'}]

49.2.4 Combined

from pathlib import Path
from typing import List, Union

def find_files(directory: str,
               extensions: Union[List[str], None] = None,
               pattern: Union[str, None] = None) -> List[Path]:
    """Find paths under *directory* by glob pattern or by extension.

    Args:
        directory: Root directory to search recursively.
        extensions: Extensions to keep, with or without a leading dot.
            Matching is case-insensitive. Ignored when *pattern* is given.
        pattern: Glob pattern passed to ``Path.rglob``. Takes precedence
            over *extensions*; its matches are returned unfiltered.

    Returns:
        A list of ``Path`` objects. With neither *pattern* nor
        *extensions*, every regular file under *directory* is returned.
    """
    path = Path(directory)

    if pattern:
        # Use custom pattern
        return list(path.rglob(pattern))

    if extensions:
        # Normalize extensions: add the dot if missing and lowercase them.
        # The suffix is lowercased below, so an un-lowercased request such
        # as 'PDF' could otherwise never match.
        exts = {(e if e.startswith('.') else f'.{e}').lower()
                for e in extensions}
        # Single scan, filter by suffix
        return [f for f in path.rglob('*')
                if f.is_file() and f.suffix.lower() in exts]

    # Return all files
    return [f for f in path.rglob('*') if f.is_file()]

# Usage
documents = find_files('input/', extensions=['pdf', 'docx'])
configs = find_files('input/', pattern='*.config')