#!/usr/bin/env python3
"""
Substructure Filter
Filter molecules based on substructure patterns using SMARTS.
Supports inclusion and exclusion filters, and custom pattern libraries.
Usage:
python substructure_filter.py molecules.smi --pattern "c1ccccc1" --output filtered.smi
python substructure_filter.py database.sdf --exclude "C(=O)Cl" --filter-type functional-groups
"""
import argparse
import sys
from pathlib import Path
try:
from rdkit import Chem
except ImportError:
print("Error: RDKit not installed. Install with: conda install -c conda-forge rdkit")
sys.exit(1)
# Common SMARTS pattern libraries
PATTERN_LIBRARIES = {
'functional-groups': {
'alcohol': '[OH][C]',
'aldehyde': '[CH1](=O)',
'ketone': '[C](=O)[C]',
'carboxylic_acid': 'C(=O)[OH]',
'ester': 'C(=O)O[C]',
'amide': 'C(=O)N',
'amine': '[NX3]',
'ether': '[C][O][C]',
'nitrile': 'C#N',
'nitro': '[N+](=O)[O-]',
'halide': '[C][F,Cl,Br,I]',
'thiol': '[C][SH]',
'sulfide': '[C][S][C]',
},
'rings': {
'benzene': 'c1ccccc1',
'pyridine': 'n1ccccc1',
'pyrrole': 'n1cccc1',
'furan': 'o1cccc1',
'thiophene': 's1cccc1',
'imidazole': 'n1cncc1',
'indole': 'c1ccc2[nH]ccc2c1',
'naphthalene': 'c1ccc2ccccc2c1',
},
'pains': {
'rhodanine': 'S1C(=O)NC(=S)C1',
'catechol': 'c1ccc(O)c(O)c1',
'quinone': 'O=C1C=CC(=O)C=C1',
'michael_acceptor': 'C=CC(=O)',
'alkyl_halide': '[C][I,Br]',
},
'privileged': {
'biphenyl': 'c1ccccc1-c2ccccc2',
'piperazine': 'N1CCNCC1',
'piperidine': 'N1CCCCC1',
'morpholine': 'N1CCOCC1',
}
}
def load_molecules(file_path, keep_props=True):
"""Load molecules from file."""
path = Path(file_path)
if not path.exists():
print(f"Error: File not found: {file_path}")
return []
molecules = []
if path.suffix.lower() in ['.sdf', '.mol']:
suppl = Chem.SDMolSupplier(str(path))
elif path.suffix.lower() in ['.smi', '.smiles', '.txt']:
suppl = Chem.SmilesMolSupplier(str(path), titleLine=False)
else:
print(f"Error: Unsupported file format: {path.suffix}")
return []
for idx, mol in enumerate(suppl):
if mol is None:
print(f"Warning: Failed to parse molecule {idx+1}")
continue
molecules.append(mol)
return molecules
def create_pattern_query(pattern_string):
"""Create SMARTS query from string or SMILES."""
# Try as SMARTS first
query = Chem.MolFromSmarts(pattern_string)
if query is not None:
return query
# Try as SMILES
query = Chem.MolFromSmiles(pattern_string)
if query is not None:
return query
print(f"Error: Invalid pattern: {pattern_string}")
return None
def filter_molecules(molecules, include_patterns=None, exclude_patterns=None,
match_all_include=False):
"""
Filter molecules based on substructure patterns.
Args:
molecules: List of RDKit Mol objects
include_patterns: List of (name, pattern) tuples to include
exclude_patterns: List of (name, pattern) tuples to exclude
match_all_include: If True, molecule must match ALL include patterns
Returns:
Tuple of (filtered_molecules, match_info)
"""
filtered = []
match_info = []
for idx, mol in enumerate(molecules):
if mol is None:
continue
# Check exclusion patterns first
excluded = False
exclude_matches = []
if exclude_patterns:
for name, pattern in exclude_patterns:
if mol.HasSubstructMatch(pattern):
excluded = True
exclude_matches.append(name)
if excluded:
match_info.append({
'index': idx + 1,
'smiles': Chem.MolToSmiles(mol),
'status': 'excluded',
'matches': exclude_matches
})
continue
# Check inclusion patterns
if include_patterns:
include_matches = []
for name, pattern in include_patterns:
if mol.HasSubstructMatch(pattern):
include_matches.append(name)
# Decide if molecule passes inclusion filter
if match_all_include:
passed = len(include_matches) == len(include_patterns)
else:
passed = len(include_matches) > 0
if passed:
filtered.append(mol)
match_info.append({
'index': idx + 1,
'smiles': Chem.MolToSmiles(mol),
'status': 'included',
'matches': include_matches
})
else:
match_info.append({
'index': idx + 1,
'smiles': Chem.MolToSmiles(mol),
'status': 'no_match',
'matches': []
})
else:
# No inclusion patterns, keep all non-excluded
filtered.append(mol)
match_info.append({
'index': idx + 1,
'smiles': Chem.MolToSmiles(mol),
'status': 'included',
'matches': []
})
return filtered, match_info
def write_molecules(molecules, output_file):
"""Write molecules to file."""
output_path = Path(output_file)
if output_path.suffix.lower() in ['.sdf']:
writer = Chem.SDWriter(str(output_path))
for mol in molecules:
writer.write(mol)
writer.close()
elif output_path.suffix.lower() in ['.smi', '.smiles', '.txt']:
with open(output_path, 'w') as f:
for mol in molecules:
smiles = Chem.MolToSmiles(mol)
name = mol.GetProp('_Name') if mol.HasProp('_Name') else ''
f.write(f"{smiles} {name}\n")
else:
print(f"Error: Unsupported output format: {output_path.suffix}")
return
print(f"Wrote {len(molecules)} molecules to {output_file}")
def write_report(match_info, output_file):
"""Write detailed match report."""
import csv
with open(output_file, 'w', newline='') as f:
fieldnames = ['Index', 'SMILES', 'Status', 'Matches']
writer = csv.DictWriter(f, fieldnames=fieldnames)
writer.writeheader()
for info in match_info:
writer.writerow({
'Index': info['index'],
'SMILES': info['smiles'],
'Status': info['status'],
'Matches': ', '.join(info['matches'])
})
def print_summary(total, filtered, match_info):
"""Print filtering summary."""
print("\n" + "="*60)
print("Filtering Summary")
print("="*60)
print(f"Total molecules: {total}")
print(f"Passed filter: {len(filtered)}")
print(f"Filtered out: {total - len(filtered)}")
print(f"Pass rate: {len(filtered)/total*100:.1f}%")
# Count by status
status_counts = {}
for info in match_info:
status = info['status']
status_counts[status] = status_counts.get(status, 0) + 1
print("\nBreakdown:")
for status, count in status_counts.items():
print(f" {status:15s}: {count}")
print("="*60)
def main():
parser = argparse.ArgumentParser(
description='Filter molecules by substructure patterns',
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=f"""
Pattern libraries:
--filter-type functional-groups Common functional groups
--filter-type rings Ring systems
--filter-type pains PAINS (Pan-Assay Interference)
--filter-type privileged Privileged structures
Examples:
# Include molecules with benzene ring
python substructure_filter.py molecules.smi --pattern "c1ccccc1" -o filtered.smi
# Exclude reactive groups
python substructure_filter.py database.sdf --exclude "C(=O)Cl" -o clean.sdf
# Filter by functional groups
python substructure_filter.py molecules.smi --filter-type functional-groups -o fg.smi
# Remove PAINS
python substructure_filter.py compounds.smi --filter-type pains --exclude-mode -o clean.smi
# Multiple patterns
python substructure_filter.py mol.smi --pattern "c1ccccc1" --pattern "N" -o aromatic_amines.smi
"""
)
parser.add_argument('input', help='Input file (SDF or SMILES)')
parser.add_argument('--pattern', '-p', action='append',
help='SMARTS/SMILES pattern to include (can specify multiple)')
parser.add_argument('--exclude', '-e', action='append',
help='SMARTS/SMILES pattern to exclude (can specify multiple)')
parser.add_argument('--filter-type', choices=PATTERN_LIBRARIES.keys(),
help='Use predefined pattern library')
parser.add_argument('--exclude-mode', action='store_true',
help='Use filter-type patterns for exclusion instead of inclusion')
parser.add_argument('--match-all', action='store_true',
help='Molecule must match ALL include patterns')
parser.add_argument('--output', '-o', help='Output file')
parser.add_argument('--report', '-r', help='Write detailed report to CSV')
parser.add_argument('--list-patterns', action='store_true',
help='List available pattern libraries and exit')
args = parser.parse_args()
# List patterns mode
if args.list_patterns:
print("\nAvailable Pattern Libraries:")
print("="*60)
for lib_name, patterns in PATTERN_LIBRARIES.items():
print(f"\n{lib_name}:")
for name, pattern in patterns.items():
print(f" {name:25s}: {pattern}")
sys.exit(0)
# Load molecules
print(f"Loading molecules from: {args.input}")
molecules = load_molecules(args.input)
if not molecules:
print("Error: No valid molecules loaded")
sys.exit(1)
print(f"Loaded {len(molecules)} molecules")
# Prepare patterns
include_patterns = []
exclude_patterns = []
# Add custom include patterns
if args.pattern:
for pattern_str in args.pattern:
query = create_pattern_query(pattern_str)
if query:
include_patterns.append(('custom', query))
# Add custom exclude patterns
if args.exclude:
for pattern_str in args.exclude:
query = create_pattern_query(pattern_str)
if query:
exclude_patterns.append(('custom', query))
# Add library patterns
if args.filter_type:
lib_patterns = PATTERN_LIBRARIES[args.filter_type]
for name, pattern_str in lib_patterns.items():
query = create_pattern_query(pattern_str)
if query:
if args.exclude_mode:
exclude_patterns.append((name, query))
else:
include_patterns.append((name, query))
if not include_patterns and not exclude_patterns:
print("Error: No patterns specified")
sys.exit(1)
# Print filter configuration
print(f"\nFilter configuration:")
if include_patterns:
print(f" Include patterns: {len(include_patterns)}")
if args.match_all:
print(" Mode: Match ALL")
else:
print(" Mode: Match ANY")
if exclude_patterns:
print(f" Exclude patterns: {len(exclude_patterns)}")
# Perform filtering
print("\nFiltering...")
filtered, match_info = filter_molecules(
molecules,
include_patterns=include_patterns if include_patterns else None,
exclude_patterns=exclude_patterns if exclude_patterns else None,
match_all_include=args.match_all
)
# Print summary
print_summary(len(molecules), filtered, match_info)
# Write output
if args.output:
write_molecules(filtered, args.output)
if args.report:
write_report(match_info, args.report)
print(f"Detailed report written to: {args.report}")
if __name__ == '__main__':
main()