1 /**
2 	Utility functions for string processing
3 
4 	Copyright: © 2012-2014 Sönke Ludwig
5 	License: Subject to the terms of the MIT license, as written in the included LICENSE.txt file.
6 	Authors: Sönke Ludwig
7 */
8 module vibe.utils..string;
9 
10 public import std.string;
11 
12 import vibe.utils.array;
13 import vibe.container.internal.utilallocator;
14 
15 import std.algorithm;
16 import std.array;
17 import std.ascii;
18 import std.format;
19 import std.typecons : Yes;
20 import std.uni;
21 import std.utf;
22 import core.exception;
23 
24 
/**
	Converts a byte sequence that may contain invalid UTF-8 into a valid UTF-8 string.

	Sequences that decode successfully are re-encoded unchanged. Whenever decoding
	fails at some position, the single byte at that position is emitted as the
	code point with the same numeric value and processing continues with the
	next byte.

	Params:
		str = raw bytes to sanitize
	Returns:
		A newly built UTF-8 string.
*/
deprecated("Use `std.encoding.sanitize` instead")
string sanitizeUTF8(in ubyte[] str)
@safe pure {
	import std.utf : decode, encode, UTFException;

	auto output = appender!string();
	output.reserve(str.length);

	// view the same bytes as UTF-8 code units for the decoder
	const text = cast(const(char)[]) str;

	for (size_t pos = 0; pos < str.length; ) {
		// fallback value: the raw byte, used if decoding fails below
		dchar codepoint = str[pos];
		try {
			codepoint = decode(text, pos);
		} catch (UTFException) {
			// decode() leaves `pos` untouched when it throws, so skip one byte
			++pos;
		}

		char[4] encoded;
		const count = encode(encoded, codepoint);
		output.put(encoded[0 .. count]);
	}

	return output.data;
}
49 
/**
	Removes a leading UTF-8 byte order mark (bytes EF BB BF) from a string.

	Handy for text that was read from a file.

	Params:
		str = the string to inspect
	Returns:
		`str` without its first three bytes if they form the BOM, otherwise
		`str` unchanged. The result is always a slice of the input.
*/
inout(char)[] stripUTF8Bom(inout(char)[] str)
@safe pure nothrow {
	if (str.length < 3)
		return str;

	immutable hasBom = str[0] == 0xEF && str[1] == 0xBB && str[2] == 0xBF;
	return hasBom ? str[3 .. $] : str;
}
60 
61 
/**
	Tests whether every character of 'str' also occurs in 'chars'.

	'str' is walked code point by code point.

	Params:
		str = the string whose characters are checked
		chars = the set of permitted characters
	Returns:
		`true` if no character outside of 'chars' was found (this includes
		an empty 'str'), `false` otherwise.
*/
bool allOf(const(char)[] str, const(char)[] chars)
@safe pure {
	foreach (dchar candidate; str) {
		const permitted = chars.canFind(candidate);
		if (!permitted) return false;
	}
	return true;
}
72 
/**
	Checks if any character in 'str' is contained in 'chars'.

	'str' is examined code point by code point, so multi-byte UTF-8
	characters are matched as a whole instead of byte by byte. Decoding is
	done with `std.utf.byDchar`, which does not throw on malformed input.

	Params:
		str = the string whose characters are checked
		chars = the set of characters to look for
	Returns:
		`true` if at least one character of 'str' occurs in 'chars',
		`false` otherwise (including for an empty 'str').
*/
bool anyOf(const(char)[] str, const(char)[] chars)
@safe pure {
	import std.utf : byDchar;

	// Iterating raw code units would hand single bytes of a multi-byte
	// sequence to canFind(), which interprets them as code points.
	foreach (dchar ch; str.byDchar)
		if (chars.canFind(ch))
			return true;
	return false;
}
83 
84 
/// Removes leading ASCII whitespace (space and tab only) and returns the remaining slice
inout(char)[] stripLeftA(inout(char)[] s)
@safe pure nothrow {
	size_t start = 0;
	for (; start < s.length; ++start) {
		immutable c = s[start];
		if (c != ' ' && c != '\t')
			break;
	}
	return s[start .. $];
}
92 
/// Removes trailing ASCII whitespace (space and tab only) and returns the remaining slice
inout(char)[] stripRightA(inout(char)[] s)
@safe pure nothrow {
	size_t end = s.length;
	for (; end > 0; --end) {
		immutable c = s[end - 1];
		if (c != ' ' && c != '\t')
			break;
	}
	return s[0 .. end];
}
100 
/// Removes ASCII whitespace (space and tab only) from both ends of the string
inout(char)[] stripA(inout(char)[] s)
@safe pure nothrow {
	// trim the tail first, then the head
	auto withoutTail = s.stripRightA;
	return withoutTail.stripLeftA;
}
106 
/**
	Finds the first occurrence of any of the characters in `chars`.

	`str` is decoded code point by code point, so multi-byte UTF-8
	characters are matched as a whole instead of byte by byte. Malformed
	sequences are decoded as the Unicode replacement character rather than
	raising an exception.

	Params:
		str = the string to search
		chars = the set of characters to look for
	Returns:
		The byte offset within `str` at which the first matching character
		starts, or -1 if none of the characters occurs in `str`.
*/
sizediff_t indexOfAny(const(char)[] str, const(char)[] chars)
@safe pure {
	import std.typecons : Yes;
	import std.utf : decode;

	size_t pos = 0;
	while (pos < str.length) {
		// remember where this code point starts; decode() advances `pos`
		immutable start = pos;
		immutable dchar ch = decode!(Yes.useReplacementDchar)(str, pos);
		if (chars.canFind(ch))
			return start;
	}
	return -1;
}
alias countUntilAny = indexOfAny;
116 
/**
	Locates the bracket that closes the bracket at the start of a string
	(supports '[', '$(LPAREN)', '<' and '{').

	Params:
		str = input string, expected to begin with an opening bracket
		nested = if `true`, further opening brackets of the same kind
			increase the nesting depth and must be closed as well; if
			`false`, the first closing bracket ends the search
	Returns:
		The index of the closing bracket or -1 for unbalanced strings
		and strings that don't start with a bracket.
*/
sizediff_t matchBracket(const(char)[] str, bool nested = true)
@safe pure nothrow {
	// an opening and a closing bracket need at least two characters
	if (str.length < 2) return -1;

	immutable char opener = str[0];
	char closer;
	if (opener == '[') closer = ']';
	else if (opener == '(') closer = ')';
	else if (opener == '<') closer = '>';
	else if (opener == '{') closer = '}';
	else return -1;

	size_t depth = 1;
	for (size_t pos = 1; pos < str.length; ++pos) {
		immutable ch = str[pos];
		if (nested && ch == opener) depth++;
		else if (ch == closer) depth--;

		if (depth == 0) return pos;
	}
	return -1;
}
148 
@safe unittest
{
	// simple, well-formed bracket pairs
	assert(matchBracket("[foo]") == 4);
	assert(matchBracket("<bar>") == 4);
	assert(matchBracket("{baz}") == 4);

	// too short, unbalanced, or not starting with a bracket
	assert(matchBracket("[") == -1);
	assert(matchBracket("[foo") == -1);
	assert(matchBracket("ab[f]") == -1);

	// nesting only counts brackets of the same kind
	assert(matchBracket("[foo[bar]]") == 9);
	assert(matchBracket("[foo{bar]]") == 8);

	// with nesting disabled the first closing bracket wins
	assert(matchBracket("[foo[bar]]", false) == 8);

	// must also be usable at compile time
	static assert(matchBracket("[foo]") == 4);
}
162 
/**
	Same as std.string.format, just using an allocator.

	The output is written with `formattedWrite` into an `AllocAppender!string`
	that is constructed from `alloc`.

	Params:
		alloc = allocator handed to the `AllocAppender`
		fmt = format string
		args = values referenced by the format string
	Returns:
		The appender's accumulated data as a string.
*/
string formatAlloc(Allocator, ARGS...)(scope Allocator alloc, string fmt, ARGS args)
{
	auto buffer = AllocAppender!string(alloc);

	// formattedWrite receives a pointer to the local appender
	auto sink = () @trusted { return &buffer; } ();
	formattedWrite(sink, fmt, args);

	return () @trusted { return buffer.data; } ();
}
170 
/**
	Special version of icmp() with optimization for ASCII characters.

	The common prefix of both strings is skipped byte-wise. The remainder is
	compared case-insensitively: pairs of ASCII characters are lower-cased
	directly, everything else is decoded (malformed sequences become the
	Unicode replacement character) and lower-cased using `std.uni.toLower`.

	Params:
		a = left-hand side string
		b = right-hand side string
	Returns:
		-1 if `a` orders before `b`, 1 if `a` orders after `b` and 0 if
		both strings are equal when ignoring case.
*/
int icmp2(const(char)[] a, const(char)[] b)
@safe pure nothrow {
	static bool isContinuationByte(char c) @safe pure nothrow @nogc {
		return (c & 0xC0) == 0x80;
	}

	size_t i = 0, j = 0;

	// fast skip equal prefix
	immutable min_len = min(a.length, b.length);
	while (i < min_len && a[i] == b[i]) i++;

	// Don't stop half-way in a UTF-8 sequence: if the first differing byte is
	// a continuation byte, rewind to the lead byte of that sequence. Stepping
	// back only a single byte is not enough for 3 and 4 byte sequences and
	// would make code points that differ in a trailing byte compare equal.
	if (i > 0 && i < min_len && (isContinuationByte(a[i]) || isContinuationByte(b[i]))) {
		do {
			i--; // a[i] == b[i] from here on, so checking `a` suffices
		} while (i > 0 && isContinuationByte(a[i]));
	}
	j = i;

	// compare the differing character and the rest of the string
	while (i < a.length && j < b.length) {
		uint ac = cast(uint)a[i];
		uint bc = cast(uint)b[j];
		if (!((ac | bc) & 0x80)) {
			// both are ASCII: fold case by hand
			i++;
			j++;
			if (ac >= 'A' && ac <= 'Z') ac += 'a' - 'A';
			if (bc >= 'A' && bc <= 'Z') bc += 'a' - 'A';
			if (ac < bc) return -1;
			else if (ac > bc) return 1;
		} else {
			// at least one non-ASCII character: decode full code points;
			// `i` and `j` may advance by different amounts
			dchar acp = decode!(Yes.useReplacementDchar)(a, i);
			dchar bcp = decode!(Yes.useReplacementDchar)(b, j);
			if (acp != bcp) {
				acp = std.uni.toLower(acp);
				bcp = std.uni.toLower(bcp);
				if (acp < bcp) return -1;
				else if (acp > bcp) return 1;
			}
		}
	}

	// one string is exhausted: the longer one orders last
	if (i < a.length) return 1;
	else if (j < b.length) return -1;

	assert(i == a.length || j == b.length, "Strings equal but we didn't fully compare them!?");
	return 0;
}