1 /**
2 	Utility functions for string processing
3 
4 	Copyright: © 2012-2014 RejectedSoftware e.K.
5 	License: Subject to the terms of the MIT license, as written in the included LICENSE.txt file.
6 	Authors: Sönke Ludwig
7 */
module vibe.utils.string;
9 
10 public import std.string;
11 
12 import vibe.utils.array;
13 import vibe.internal.utilallocator;
14 
15 import std.algorithm;
16 import std.array;
17 import std.ascii;
18 import std.format;
19 import std.uni;
20 import std.utf;
21 import core.exception;
22 
23 
/**
	Converts a byte sequence that may contain invalid UTF-8 into a valid UTF-8 string.

	Well-formed sequences are decoded and re-encoded unchanged. Whenever decoding
	fails at a position, that single byte is taken as a code point of the same
	numeric value, encoded as UTF-8, and processing resumes with the next byte.

	Params:
		str = raw input bytes
	Returns:
		A newly built, valid UTF-8 string.
*/
string sanitizeUTF8(in ubyte[] str)
@safe pure {
	import std.utf;

	auto output = appender!string();
	output.reserve(str.length);

	for (size_t pos = 0; pos < str.length; ) {
		// fallback value in case decoding fails: the raw byte itself
		dchar codepoint = str[pos];
		try {
			// advances `pos` past the decoded sequence on success
			codepoint = std.utf.decode(cast(const(char[]))str, pos);
		} catch (UTFException) {
			// skip only the offending byte
			pos++;
		}

		char[4] encoded;
		immutable count = std.utf.encode(encoded, codepoint);
		output.put(encoded[0 .. count]);
	}

	return output.data;
}
47 
/**
	Removes a leading UTF-8 byte order mark (the bytes EF BB BF), if present.

	Handy for text that was read from a file.

	Params:
		str = the string to inspect
	Returns:
		`str` without its first three bytes if they form the BOM, otherwise
		`str` unchanged.
*/
inout(char)[] stripUTF8Bom(inout(char)[] str)
@safe pure nothrow {
	immutable hasBom = str.length >= 3
		&& str[0] == 0xEF
		&& str[1] == 0xBB
		&& str[2] == 0xBF;
	return hasBom ? str[3 .. $] : str;
}
58 
59 
/**
	Tests whether every character of 'str' also occurs in 'chars'.

	'str' is iterated as decoded code points. An empty 'str' yields `true`.

	Params:
		str = the string whose characters are tested
		chars = the set of permitted characters
	Returns:
		`false` as soon as a character of 'str' is missing from 'chars',
		`true` otherwise.
*/
bool allOf(const(char)[] str, const(char)[] chars)
@safe pure {
	foreach (dchar candidate; str) {
		if (chars.canFind(candidate))
			continue;
		return false;
	}
	return true;
}
70 
/**
	Variant of `std.string.indexOf` for a single character that has a
	dedicated code path for compile-time evaluation.

	During CTFE the string is scanned with a plain decoding loop; at run time
	the call is forwarded to `std.string.indexOf`.

	Params:
		s = the string to search
		c = the character to look for
		cs = whether the comparison is case sensitive; when it is not, both
			sides are lowered with `std.uni.toLower` before comparing
	Returns:
		The code unit index at which the first matching character starts,
		or -1 if there is none.
*/
ptrdiff_t indexOfCT(Char)(in Char[] s, dchar c, CaseSensitive cs = CaseSensitive.yes)
@safe pure {
	if (__ctfe) {
		if (cs == CaseSensitive.yes) {
			foreach (i, dchar ch; s)
				if (ch == c)
					return i;
		} else {
			// lower the needle once instead of on every iteration
			c = std.uni.toLower(c);
			foreach (i, dchar ch; s)
				if (std.uni.toLower(ch) == c)
					return i;
		}
		return -1;
	} else return std.string.indexOf(s, c, cs);
}
/**
	Variant of `std.string.indexOf` for a substring that has a dedicated code
	path for compile-time evaluation.

	During CTFE every possible start position is compared against `needle`;
	at run time the call is forwarded to `std.string.indexOf`.

	Params:
		s = the string to search
		needle = the substring to look for
	Returns:
		The code unit index of the first occurrence of `needle` in `s`,
		or -1 if there is none.
*/
ptrdiff_t indexOfCT(Char)(in Char[] s, in Char[] needle)
{
	if (__ctfe) {
		if (s.length < needle.length) return -1;
		// The last valid start position is s.length - needle.length, so the
		// exclusive upper bound has to be one past it.
		foreach (i; 0 .. s.length - needle.length + 1)
			if (s[i .. i+needle.length] == needle)
				return i;
		return -1;
	} else return std.string.indexOf(s, needle);
}
97 
/**
	Checks if any character in 'str' is contained in 'chars'.

	'str' is iterated as decoded code points, matching the behavior of
	`allOf`. Iterating raw UTF-8 code units instead would treat the individual
	bytes of a multi-byte character as stand-alone characters, which both
	misses real matches and reports matches that are not there.

	Params:
		str = the string whose characters are tested
		chars = the set of characters to look for
	Returns:
		`true` as soon as a character of 'str' is found in 'chars',
		`false` otherwise (including for an empty 'str').
*/
bool anyOf(const(char)[] str, const(char)[] chars)
@safe pure {
	foreach (dchar ch; str)
		if (chars.canFind(ch))
			return true;
	return false;
}
108 
109 
/// Removes leading ASCII blanks (space and tab only) and returns the remaining slice of `s`.
inout(char)[] stripLeftA(inout(char)[] s)
@safe pure nothrow {
	size_t start = 0;
	for (; start < s.length; start++) {
		immutable c = s[start];
		if (c != ' ' && c != '\t')
			break;
	}
	return s[start .. $];
}
117 
/// Removes trailing ASCII blanks (space and tab only) and returns the remaining slice of `s`.
inout(char)[] stripRightA(inout(char)[] s)
@safe pure nothrow {
	size_t end = s.length;
	while (end > 0) {
		immutable c = s[end - 1];
		if (c != ' ' && c != '\t')
			break;
		end--;
	}
	return s[0 .. end];
}
125 
/// Removes ASCII blanks (space and tab only) from both ends of `s`: first the trailing ones, then the leading ones.
inout(char)[] stripA(inout(char)[] s)
@safe pure nothrow {
	auto withoutTrailing = stripRightA(s);
	return stripLeftA(withoutTrailing);
}
131 
/**
	Finds the first occurrence of any of the characters in `chars`.

	`str` is iterated as decoded code points, so the bytes of a multi-byte
	character are never matched individually and the result always refers to
	the beginning of a character.

	Params:
		str = the string to search
		chars = the set of characters to look for
	Returns:
		The code unit index at which the first matching character starts,
		or -1 if no character of `str` is contained in `chars`.
*/
sizediff_t indexOfAny(const(char)[] str, const(char)[] chars)
@safe pure {
	foreach (i, dchar ch; str)
		if (chars.canFind(ch))
			return i;
	return -1;
}
/// Alternative name for `indexOfAny`.
alias countUntilAny = indexOfAny;
141 
/**
	Locates the bracket that closes the one at the start of the string
	(supports '[', '$(LPAREN)', '<' and '{').

	Only brackets of the same kind as the opening one are taken into account;
	other bracket characters are treated like ordinary text.

	Params:
		str = input string, expected to begin with an opening bracket
		nested = if `true`, further opening brackets of the same kind increase
			the nesting depth and need their own closing bracket; if `false`,
			the first closing bracket ends the search
	Returns:
		The index of the matching closing bracket, or -1 if the string is
		shorter than two characters, does not start with a supported bracket,
		or is unbalanced.
*/
sizediff_t matchBracket(const(char)[] str, bool nested = true)
@safe pure nothrow {
	if (str.length < 2) return -1;

	immutable char opening = str[0];
	char closing;
	switch (opening) {
		case '[': closing = ']'; break;
		case '(': closing = ')'; break;
		case '<': closing = '>'; break;
		case '{': closing = '}'; break;
		default: return -1;
	}

	// number of brackets that are currently open
	size_t depth = 1;
	for (size_t pos = 1; pos < str.length; pos++) {
		immutable c = str[pos];
		if (nested && c == opening) {
			depth++;
		} else if (c == closing) {
			depth--;
			if (depth == 0) return pos;
		}
	}
	return -1;
}
173 
// Exercises matchBracket with balanced, unbalanced and nested inputs,
// at run time as well as at compile time.
@safe unittest
{
	static struct Case { string input; sizediff_t expected; }
	static immutable Case[] cases = [
		// simple balanced brackets
		Case("[foo]", 4), Case("<bar>", 4), Case("{baz}", 4),
		// too short, unbalanced, or not starting with a bracket
		Case("[", -1), Case("[foo", -1), Case("ab[f]", -1),
		// nesting of the same kind vs. an unrelated bracket kind
		Case("[foo[bar]]", 9), Case("[foo{bar]]", 8),
	];
	foreach (c; cases)
		assert(matchBracket(c.input) == c.expected);

	// without nesting, the first closing bracket wins
	assert(matchBracket("[foo[bar]]", false) == 8);

	// must be usable during compile-time evaluation
	static assert(matchBracket("[foo]") == 4);
}
187 
/**
	Same as std.string.format, except that the result is accumulated in an
	`AllocAppender` constructed from the given allocator.

	Params:
		alloc = allocator handed to the `AllocAppender`
		fmt = format string, interpreted by `formattedWrite`
		args = values to format
	Returns:
		The appender's data after all arguments have been written.
*/
string formatAlloc(ARGS...)(IAllocator alloc, string fmt, ARGS args)
{
	auto buffer = AllocAppender!string(alloc);
	// the address of the local appender is taken inside a @trusted lambda
	auto sink = () @trusted { return &buffer; } ();
	formattedWrite(sink, fmt, args);
	return () @trusted { return buffer.data; } ();
}
195 
/**
	Special version of icmp() with optimization for ASCII characters.

	The common prefix of both strings is skipped byte-wise. The remainder is
	compared character by character: pairs of ASCII characters are lowered
	with simple arithmetic, everything else is decoded with `std.utf.decode`
	and lowered with `std.uni.toLower`.

	Params:
		a = first string
		b = second string
	Returns:
		-1 if `a` orders before `b`, 1 if `a` orders after `b` and 0 if both
		are equal, ignoring case.
*/
int icmp2(const(char)[] a, const(char)[] b)
@safe pure {
	// true if `idx` points at a UTF-8 continuation byte (10xxxxxx) of `s`
	static bool isContinuation(const(char)[] s, size_t idx)
	@safe pure nothrow @nogc {
		return idx < s.length && (s[idx] & 0xC0) == 0x80;
	}

	size_t i = 0, j = 0;

	// fast skip equal prefix
	size_t min_len = min(a.length, b.length);
	while( i < min_len && a[i] == b[i] ) i++;

	// Don't stop half-way in a UTF-8 sequence: if the scan ended on a
	// continuation byte in either string, walk back to the lead byte of that
	// character. The skipped prefix is identical in both strings, so the same
	// index is a character boundary for both. Positions directly behind a
	// complete multi-byte character are left untouched.
	while( i > 0 && (isContinuation(a, i) || isContinuation(b, i)) ) i--;
	j = i;

	// compare the differing character and the rest of the string
	while(i < a.length && j < b.length){
		uint ac = cast(uint)a[i];
		uint bc = cast(uint)b[j];
		if( !((ac | bc) & 0x80) ){
			// both are ASCII: no decoding necessary
			i++;
			j++;
			if( ac >= 'A' && ac <= 'Z' ) ac += 'a' - 'A';
			if( bc >= 'A' && bc <= 'Z' ) bc += 'a' - 'A';
			if( ac < bc ) return -1;
			else if( ac > bc ) return 1;
		} else {
			// at least one side is non-ASCII: compare full code points
			dchar acp = decode(a, i);
			dchar bcp = decode(b, j);
			if( acp != bcp ){
				acp = std.uni.toLower(acp);
				bcp = std.uni.toLower(bcp);
				if( acp < bcp ) return -1;
				else if( acp > bcp ) return 1;
			}
		}
	}

	// one string is a (case-insensitive) prefix of the other: the longer one orders last
	if( i < a.length ) return 1;
	else if( j < b.length ) return -1;

	assert(i == a.length || j == b.length, "Strings equal but we didn't fully compare them!?");
	return 0;
}