Useful code snippets
- Creating a local HTTP server (Python 3.x):
- to read files containing foreign languages, use "ISO-8859-1", also known as "latin1":
- use "windows-1252":
- to filter strings with AND operators:
- to filter strings with OR operators:
- Reading a file:
- Writing to a file:
- Create a list based on existing lists:
- Flattening a list of lists:
- Chunking a list into n-sized chunks:
- Removing duplicates from a list:
- Merging two dictionaries:
- Using 'update' to merge two dictionaries:
- Using enumerate to get index/value pairs:
- Checking memory usage of an object:
- Getting the most frequent value in a list:
- Getting five most frequent values in a list:
- Counting the frequency of all the numbers in a list in descending order:
- Formatting the frequency of all the numbers in a list in descending order:
- Swapping two variables:
- Checking if a file exists:
- List slicing tricks: reversing a list
- JSON serialization and deserialization:
- Filtering the 200 most frequent words from movie transcripts
# --- Creating a local HTTP server (Python 3.x) ---
# Go to the website directory on the computer, type 'cmd', and run:
#     python -m http.server 8000
# Open a web browser and go to http://localhost:8000
# If port 8000 is already in use, type another port number such as '8080'.
# To stop the server, go back to the command prompt where the server is
# running and press Ctrl+C.

# --- Reading files containing foreign languages: use "ISO-8859-1" (latin1) ---
import pandas as pd

data_set = pd.read_csv("file_path/file_name", encoding="ISO-8859-1")
print(data_set.head(5))

# --- Using "windows-1252" ---
import pandas as pd
data_set = pd.read_csv("file_path/file_name", encoding="windows-1252")
print(data_set.head(5))

# --- Filtering strings with AND operators ---
# Assuming df is a pandas DataFrame:
# keep rows containing 'meet' but not 'nice to meet you' or 'meeting'
filtered_df = df[
    (df['English'].str.contains('meet', case=False))
    & ~(df['English'].str.contains('nice to meet you|meeting', case=False))
]
print(filtered_df)

# NOTE: When combining multiple conditions with logical operators in pandas —
# especially the element-wise bitwise operators & (AND) and | (OR) used on
# pandas Series — it is important to use parentheses to group the conditions.
# The bitwise logical operators have higher precedence than comparison
# operators like <, >, or ==, so without parentheses Python might evaluate
# the expression in an order that doesn't match your intended logic, leading
# to errors or unexpected results.
# --- Filtering strings with OR operators ---
# keep rows containing any of the patterns
df = df[df['English'].str.contains('meet|Nice to meet you|meeting', case=False)]

# --- Reading a file ---
# Raw string (r'...') stops backslashes in Windows paths being read as escapes,
with open(r'C:\Users\Owner\Desktop\PYTHON\auto.csv', 'r') as file:
    content = file.read()

# or you can use forward slashes to avoid getting 'unicode errors'.
# Using open() for reading files is memory-efficient for large plain text
# files, such as reading line by line, or for quick reads or simple data
# processing.
with open('C:/Users/Owner/Desktop/PYTHON/auto.csv', 'r') as file:
    content = file.read()

# --- Writing to a file ---
with open('filename.txt', 'w') as file:
    file.write('Hello, world!')

# --- Create a list based on existing lists ---
squares = [x**2 for x in range(10)]

# --- Flattening a list of lists ---
flat_list = [item for sublist in list_of_lists for item in sublist]

# --- Chunking a list into n-sized chunks ---
def chunk_list(lst, n):
    """Yield successive n-sized chunks from lst (last chunk may be shorter)."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]
# Example usage of chunk_list:
chunks = list(chunk_list([1, 2, 3, 4, 5], 2))

# --- Removing duplicates from a list ---
# NOTE: set() does not preserve the original order of the items.
unique_items = list(set(your_list))

# --- Merging two dictionaries ---
dict1 = {'a': 1, 'b': 2}
dict2 = {'b': 3, 'c': 4}
# Later dicts win on duplicate keys ('b' -> 3 here).
merged_dict = {**dict1, **dict2}
print(merged_dict)

# --- Using 'update' to merge two dictionaries ---
dict1 = {'a': 1, 'b': 2}
dict2 = {'b': 3, 'c': 4}
# In-place merge; dict2's values win on duplicate keys ('b' -> 3 here).
dict1.update(dict2)
print(dict1)

# --- Using enumerate to get index/value pairs ---
for index, value in enumerate(your_list):
    print(index, value)

# --- Checking memory usage of an object ---
# NOTE: sys.getsizeof is shallow — it does not follow contained objects.
import sys
print(sys.getsizeof(your_object), "bytes")

# --- Getting the most frequent value in a list ---
from collections import Counter
most_common = Counter(your_list).most_common(1)[0][0]

# --- Getting five most frequent values in a list ---
from collections import Counter
def most_frequent_numbers(lst, n=5):
    """Return the n most common items in lst as (item, count) pairs.

    Ties are broken by first-encounter order (Counter.most_common behavior).
    An empty list yields an empty result.
    """
    counts = Counter(lst)
    return counts.most_common(n)

# Example usage:
my_list = [1, 2, 3, 4, 1, 2, 1, 5, 2, 2, 3]
result = most_frequent_numbers(my_list)

# Display result in two columns
print("Number\tFrequency")
for number, frequency in result:
    print(f"{number}\t{frequency}")

# --- Counting the frequency of all the numbers in a list in descending order ---
from collections import Counter
def sort_numbers_by_frequency(numbers):
    """Print each number and its frequency, most frequent first.

    Output is two tab-separated columns with a header row.
    """
    # Count the frequency of each number
    freq_counter = Counter(numbers)
    # Sort by frequency in descending order
    sorted_freq = sorted(freq_counter.items(), key=lambda x: x[1], reverse=True)
    # Display in two columns (Number, Frequency)
    print("Numbers\tFrequency")
    for number, frequency in sorted_freq:
        print(f"{number}\t{frequency}")

# Example usage:
numbers_list = [1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 3, 4, 4]
sort_numbers_by_frequency(numbers_list)

# --- Formatting the frequency of all the numbers in a list in descending order ---
from collections import Counter
def sort_numbers_by_frequency(numbers):
    """Print a 'Numbers | Frequency' table sorted by descending frequency."""
    # Count the frequency of each number
    freq_counter = Counter(numbers)
    # Sort by frequency in descending order
    sorted_freq = sorted(freq_counter.items(), key=lambda x: x[1], reverse=True)
    # Display in two columns (Number, Frequency) with separator and dash lines
    print("Numbers | Frequency")
    print("-" * 20)  # Dash line
    for number, frequency in sorted_freq:
        # Width spec pads the number to 7 chars — same output as the manual
        # ' ' * (7 - len(str(number))) padding, without a negative repeat
        # count when the number is wider than the column.
        print(f"{number:<7}| {frequency}")

# Example usage:
numbers_list = [1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 3, 4, 4]
sort_numbers_by_frequency(numbers_list)

# --- Swapping two variables ---
a, b = b, a

# --- Checking if a file exists ---
import os
os.path.exists('filename.txt')

# --- List slicing tricks: reversing a list ---
reversed_list = your_list[::-1]

# --- JSON serialization and deserialization ---
import json

# Serialization: Python object -> JSON string
json_string = json.dumps(your_object)
# Deserialization: JSON string -> Python object
your_object = json.loads(json_string)

# --- Filtering the most frequent words from movie transcripts ---
import pandas as pd
from collections import Counter
import re
from pathlib import Path

import pandas as pd

# Define French stopwords (function words excluded from the frequency count)
french_stopwords = {
    'le', 'la', 'les', 'un', 'une', 'des', 'de', 'du', 'et', 'ou', 'où',
    'je', 'tu', 'il', 'elle', 'nous', 'vous', 'ils', 'elles',
    'me', 'te', 'se', 'lui', 'leur', 'leurs', 'mon', 'ma', 'mes',
    'ton', 'ta', 'tes', 'son', 'sa', 'ses', 'notre', 'votre',
    'ce', 'cet', 'cette', 'ces', 'ça', 'c',
    'à', 'au', 'aux', 'dans', 'par', 'pour', 'sur', 'avec', 'sans',
    'plus', 'moins', 'très', 'aussi', 'encore', 'déjà',
    'qui', 'que', 'quoi', 'dont', 'y', 'en',
    'ne', 'pas', 'non', 'oui', 'si',
    'être', 'avoir', 'faire', 'dire', 'aller', 'voir', 'savoir', 'pouvoir',
    'est', 'sont', 'était', 'ai', 'as', 'a', 'avons', 'avez', 'ont',
    'suis', 'es', 'sommes', 'êtes',
    'mais', 'donc', 'car', 'ni', 'or',
    'tout', 'tous', 'toute', 'toutes', 'même', 'autre', 'autres',
    'quel', 'quelle', 'quels', 'quelles', 'quelque', 'quelques',
    'bien', 'alors', 'donc', 'voilà', 'là', 'ici',
    'peut', 'peuvent', 'peux', 'veux', 'veut', 'voulons', 'voulez', 'veulent',
    # Elided forms: the punctuation pass below replaces apostrophes with
    # spaces, so "l'homme" becomes "l homme". Without these entries the
    # detached single letters would dominate the frequency list.
    'l', 'd', 'j', 'n', 's', 't', 'm', 'qu', 'jusqu', 'lorsqu', 'puisqu',
}

# Path to your CSV file - adjust this to your file location
csv_path = Path.home() / 'Desktop' / 'french_movies.csv'  # Change filename as needed

# Read the CSV file
df = pd.read_csv(csv_path)

# Display the first few rows and column names to understand the structure
print("CSV Columns:", df.columns.tolist())
print("\nFirst few rows:")
print(df.head())

# Assuming the transcript text is in a column - adjust column name as needed
# Common column names might be: 'transcript', 'text', 'dialogue', 'subtitle', etc.
text_column = 'transcript'  # CHANGE THIS to match your actual column name

# Combine all transcripts into one text (astype(str) guards against NaN cells)
all_text = ' '.join(df[text_column].astype(str))

# Convert to lowercase and remove punctuation (replaced with spaces)
all_text = all_text.lower()
all_text = re.sub(r'[!?.,;:()"\'\-–—]', ' ', all_text)

# Extract all words (lowercase French letters only; text was lowercased above)
words = re.findall(r'\b[a-zàâäæçéèêëïîôùûüÿœ]+\b', all_text)

# Filter out stopwords
filtered_words = [word for word in words if word not in french_stopwords]

# Count word frequencies
word_counts = Counter(filtered_words)

# Get the 200 most common words
top_200 = word_counts.most_common(200)

# Create a DataFrame for better visualization
results_df = pd.DataFrame(top_200, columns=['Word', 'Frequency'])

# Display results
print("\n" + "="*50)
print("TOP 200 MOST FREQUENT WORDS")
print("="*50)
print(results_df.to_string(index=False))

# Optionally save to CSV ('utf-8-sig' adds a BOM so Excel detects the encoding)
output_path = Path.home() / 'Desktop' / 'top_200_french_words.csv'
results_df.to_csv(output_path, index=False, encoding='utf-8-sig')
print(f"\n✓ Results saved to: {output_path}")