-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathHowTo_Data_Strings.py
126 lines (96 loc) · 3.75 KB
/
HowTo_Data_Strings.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# --- String creation ---
#
# Strings are iterable, sliceable and immutable.
s = str()          # empty string via the constructor
s = ''             # empty string via a literal
s = 'Hello World'
len(s)             # 11
'W' in s           # True (substring test)
s[0]               # 'H'
s[:5]              # 'Hello'
s[::-1]            # 'dlroW olleH' (reversed copy)
# s[11]            # IndexError: valid indices are 0..10
# s[1] = 'z'       # TypeError: 'str' object does not support item assignment
# --- Change capitalization ---
# Every method returns a new string; the original is never modified.
'Hello World'.upper()        # 'HELLO WORLD'
'Hello World'.lower()        # 'hello world'
'hello world'.capitalize()   # 'Hello world'  (only the first character is upper-cased)
'hEllo World'.title()        # 'Hello World'  (first letter of every word)
'Hello World'.swapcase()     # 'hELLO wORLD'
# --- meta characters (escape sequences and quoting) ---
print('Hello\n\tWorld\n')  # \n is a newline, \t is a tab
print('Hello\\n\\tWorld\\n') # escape: a doubled backslash prints one literal backslash
print(r'C:\some\name') # r'' raw string keeps backslashes literally; otherwise the \n in \name makes a newline
print("It's signe!") # double quotes outside allow a single quote inside
print('He sayd "It\'s ok" to her.') # single quotes outside: " is fine inside, ' must be escaped as \'
# --- Manipulate the string ---
' Hello '.strip()                        # 'Hello'   (whitespace removed on both ends)
'_Hello_'.strip('_')                     # 'Hello'   (given characters removed on both ends)
'Hello World'.removeprefix('Hello')      # ' World'
'Hello World'.removesuffix('World')      # 'Hello '
'Hello World'.replace('H', 'Z')          # 'Zello World'
'Hello World'.split()                    # ['Hello', 'World']
'Hello World'.split('l')                 # ['He', '', 'o Wor', 'd']
'Hello World'.partition('l')             # ('He', 'l', 'lo World')
'Hello\nWorld'.splitlines()              # ['Hello', 'World']
'+'.join('123')                          # '1+2+3', see below: concatenate tuple to 'str'
# --- Search the string ---
len('Hello World')                               # 11
'Hello World'.index('H')                         # 0  (raises ValueError when not found)
'Hello World'.count('l')                         # 3
'Hello World'.find('l')                          # 2  (index of the first match, -1 when not found)
'Hello World'.endswith('d')                      # True
'Hello World'.startswith('Hel')                  # True
'Hello World'.startswith(('H', 'I', 'J', 'K'))   # True (a tuple means: any of these prefixes)
'Hello World'.islower()                          # False
'12A45'.isnumeric()                              # False
# --- Loop over a list of strings (element-wise "vectorized" membership test) ---
a = ['name', 'value']
b = ['A', 'B', 'value', 'C', 'name', 'D', 'E']
ix = [i in a for i in b]    # boolean mask: True where the element of b also occurs in a
# b[ix]                     # TypeError: list indices must be integers or slices, not list
#                           # (indexing by a boolean mask only works on numpy arrays)
[item for item, keep in zip(b, ix) if keep]    # ['value', 'name'] - applies the mask to a plain list
# --- real world problems -----------------------------------------------
import pandas as pd
d = pd.DataFrame({
    'name': ['Irene', 'Maggie', 'Lisa'],
    'value': ['CHF657.500.-', 'CHF2,847.700.-', 'CHF1,038.000.-'],
    'area': ['23 m2', '121 m2', '63 m2'],
})
d['name'][1].split('g')                     # ['Ma', '', 'ie'] - element-wise, python built-in
d['name'].str.split('g')                    # vectorized, the .str accessor from pandas works on whole columns
d[[x.startswith('M') for x in d['name']]]   # select rows matching a string (boolean list as row mask)
d[d['name'].str.startswith('M')]            # same, vectorized
d.query("name.str.startswith('M')", engine='python')   # same; .str methods need the python engine
# clean 'CHF2,847.700.-': drop every non-digit character, then convert.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a literal string by default.
d['value_num'] = d['value'].str.replace(r'[^0-9]', '', regex=True).astype(int)
# clean '23 m2': keep the part before the first blank, then convert.
d['area_num'] = d['area'].str.split(' ').str[0].astype(float)
# --- Combining strings: +, str.format() or f-strings ---
a = 'Jonny'
b = 'Depp'
c = 67
a + ' ' + b + ' is ' + str(c) + ' years old.'   # 'Jonny Depp is 67 years old.' (non-strings need str())
"{} {} is {} years old.".format(a, b, c)        # same result; the way to go for Python version <3.6
f"{a} {b} is {c} years old."                    # same result; modern, recommended because other types are ok:
c = (23, ['A', 276])
f"{a} {b} is {c} years old."                    # "Jonny Depp is (23, ['A', 276]) years old."
# --- Format strings ---
# https://docs.python.org/3/library/string.html#format-string-syntax
# https://www.w3schools.com/python/ref_string_format.asp
def mynum():
    """Return a fixed sample float with many decimals, used to demonstrate number formatting."""
    sample = 3.182034975662637448
    return float(sample)

"{:.2f}".format(mynum())    # '3.18' - fixed-point notation with two decimals
# --- string to other types ---
# A string is an iterable of characters, so each constructor below consumes it character by character.
tuple('Hello')  # ('H', 'e', 'l', 'l', 'o')
list('Hello')  # ['H', 'e', 'l', 'l', 'o']
dict.fromkeys('Hello')  # {'H': None, 'e': None, 'l': None, 'o': None} - the duplicate 'l' collapses into one key
set('Hello')  # {'H', 'e', 'l', 'o'} - unique characters, unordered
# --- string from other types ---
# concatenate a tuple to a string:
''.join(('A', 'B', 'C'))                            # 'ABC'
'+'.join(('A', 'B', 'C'))                           # 'A+B+C'
# '+'.join(('A', 'B', 23))                          # TypeError: sequence item 2: expected str instance, int found
values = ('A', 'B', 23.2)
'+'.join([str(value) for value in values])          # 'A+B+23.2' - convert every item to 'str' first
'+'.join(map(str, values))                          # the same; join() accepts any iterable of strings
data = [(1.2, 2.9, 3.5), (4.8, 5.5, 6.3)]           # same, but now for multiple tuples
[tuple(map(str, item)) for item in data]            # [('1.2', '2.9', '3.5'), ('4.8', '5.5', '6.3')]
[tuple([str(x) for x in item]) for item in data]    # same, less readable
# --- encoding ---
# 'hällö Würld'.encode('UTF-8')  # b'h\xc3\xa4ll\xc3\xb6 W\xc3\xbcrld' - each umlaut becomes two bytes in UTF-8