Please start any new threads on our new
site at https://forums.sqlteam.com. We've got lots of great SQL Server
experts to answer whatever question you can come up with.
Author |
Topic |
Igor2004
More clever than you
78 Posts |
Posted - 2005-06-17 : 22:47:13
|
[code] -- Author: Igor Nikiforov, Montreal, EMail: udfs@sympatico.ca -- GETWORDCOUNT() User-Defined Function Counts the words in a string. -- GETWORDCOUNT(@cString[, @cDelimiters]) -- Parameters -- @cString nvarchar(4000) - Specifies the string whose words will be counted. -- @cDelimiters nvarchar(256) - Optional. Specifies one or more optional characters used to separate words in @cString. -- The default delimiters are space, tab, carriage return, and line feed. Note that GETWORDCOUNT( ) uses each of the characters in @cDelimiters as individual delimiters, not the entire string as a single delimiter. -- Return Value smallint -- Remarks GETWORDCOUNT() by default assumes that words are delimited by spaces or tabs. If you specify another character as delimiter, this function ignores spaces and tabs and uses only the specified character. -- If you use 'AAA aaa, BBB bbb, CCC ccc.' as the target string for dbo.GETWORDCOUNT(), you can get all the following results. -- declare @cString nvarchar(4000) -- set @cString = 'AAA aaa, BBB bbb, CCC ccc.' -- select dbo.GETWORDCOUNT(@cString, default) -- 6 - character groups, delimited by ' ' -- select dbo.GETWORDCOUNT(@cString, ',') -- 3 - character groups, delimited by ',' -- select dbo.GETWORDCOUNT(@cString, '.') -- 1 - character group, delimited by '.' -- See Also GETWORDNUM() User-Defined Function -- UDF the name and functionality of which correspond to the same built-in function of Visual FoxProCREATE function GETWORDCOUNT (@cSrting nvarchar(4000), @cDelimiters nvarchar(256) )returns smallint as begin -- if no break string is specified, the function uses spaces, tabs and line feed to delimit words. set @cDelimiters = isnull(@cDelimiters, space(1)+char(9)+char(10)) declare @p smallint, @end_of_string smallint, @wordcount smallint select @p = 1, @wordcount = 0 select @end_of_string = 1 + datalength(@cSrting)/(case SQL_VARIANT_PROPERTY(@cSrting,'BaseType') when 'nvarchar' then 2 else 1 end) -- for unicode while dbo.CHARINDEX_BIN(substring(@cSrting, @p, 1), @cDelimiters, 1) > 0 and @end_of_string > @p -- skip opening break characters, if any set @p = @p + 1 if @p < @end_of_string begin set @wordcount = 1 -- count the one we are in now count transitions from 'not in word' to 'in word' -- if the current character is a break char, but the next one is not, we have entered a new word while @p < @end_of_string begin if @p +1 < @end_of_string and dbo.CHARINDEX_BIN(substring(@cSrting, @p, 1), @cDelimiters, 1) > 0 and dbo.CHARINDEX_BIN(substring(@cSrting, @p+1, 1), @cDelimiters, 1) = 0 select @wordcount = @wordcount + 1, @p = @p + 1 -- Skip over the first character in the word. We know it cannot be a break character. set @p = @p + 1 end end return @wordcount endGO -- Author: Igor Nikiforov, Montreal, EMail: udfs@sympatico.ca -- GETWORDNUM() User-Defined Function -- Returns a specified word from a string. -- GETWORDNUM(@cString, @nIndex[, @cDelimiters]) -- Parameters @cString nvarchar(4000) - Specifies the string to be evaluated -- @nIndex smallint - Specifies the index position of the word to be returned. For example, if @nIndex is 3, GETWORDNUM( ) returns the third word (if @cString contains three or more words). -- @cDelimiters nvarchar(256) - Optional. Specifies one or more optional characters used to separate words in @cString. -- The default delimiters are space, tab, carriage return, and line feed. Note that GetWordNum( ) uses each of the characters in @cDelimiters as individual delimiters, not the entire string as a single delimiter. -- Return Value nvarchar(4000) -- Remarks Returns the word at the position specified by @nIndex in the target string, @cString. If @cString contains fewer than @nIndex words, GETWORDNUM( ) returns an empty string. -- See Also -- GETWORDCOUNT() User-Defined Function -- UDF the name and functionality of which correspond to the same built-in function of Visual FoxProCREATE function GETWORDNUM (@cSrting nvarchar(4000), @nIndex smallint, @cDelimiters nvarchar(256) )returns nvarchar(4000)as begin -- if no break string is specified, the function uses spaces, tabs and line feed to delimit words. set @cDelimiters = isnull(@cDelimiters, space(1)+char(9)+char(10)) declare @i smallint, @j smallint, @p smallint, @q smallint, @qmin smallint, @end_of_string smallint, @LenDelimiters smallint, @outstr nvarchar(4000) select @i = 1, @p = 1, @q = 0, @outstr = '' select @end_of_string = 1 + datalength(@cSrting)/(case SQL_VARIANT_PROPERTY(@cSrting,'BaseType') when 'nvarchar' then 2 else 1 end) -- for unicode select @LenDelimiters = datalength(@cDelimiters)/(case SQL_VARIANT_PROPERTY(@cDelimiters,'BaseType') when 'nvarchar' then 2 else 1 end) -- for unicode while @i <= @nIndex begin while dbo.CHARINDEX_BIN(substring(@cSrting, @p, 1), @cDelimiters, 1) > 0 and @end_of_string > @p -- skip opening break characters, if any set @p = @p + 1 if @p >= @end_of_string break select @j = 1, @qmin = @end_of_string -- find next break character it marks the end of this word while @j <= @LenDelimiters begin set @q = dbo.CHARINDEX_BIN(substring(@cDelimiters, @j, 1), @cSrting, @p) set @j = @j + 1 if @q > 0 and @qmin > @q set @qmin = @q end if @i = @nIndex -- this is the actual word we are looking for begin set @outstr = substring(@cSrting, @p, @qmin-@p) break end set @p = @qmin + 1 if (@p >= @end_of_string) break set @i = @i + 1 end return @outstr endGO-- Is similar to the built-in function Transact-SQL charindex, but regardless of collation settings, -- executes case-sensitive search -- Author: Igor Nikiforov, Montreal, EMail: udfs@sympatico.ca CREATE function CHARINDEX_BIN(@expression1 nvarchar(4000), @expression2 nvarchar(4000), @start_location smallint = 1)returns nvarchar(4000)as begin return charindex( cast(@expression1 as nvarchar(4000)) COLLATE Latin1_General_BIN, cast(@expression2 as nvarchar(4000)) COLLATE Latin1_General_BIN, @start_location ) endGO[/code] |
|
myleslee
Starting Member
1 Post |
Posted - 2006-05-31 : 22:23:54
|
select dbo.getWordCount('we are the world , we are the children.', default)=========the result is 9 instead of 8.Because a space was accidentally input before the comma. But it would be great to filter out punctuations.thanks. |
|
|
SwePeso
Patron Saint of Lost Yaks
30421 Posts |
Posted - 2006-06-01 : 02:28:32
|
[code]A real quicky could look like this code snippet below CREATE FUNCTION dbo.fnFastWordCount( @Text NVARCHAR(4000))RETURNS SMALLINTASBEGIN -- Get rid of prefix and suffix spaces SELECT @Text = SUBSTRING(@Text, PATINDEX('%[^\ ]%', @Text), 4000), @Text = LEFT(@Text, LEN(@Text)) -- Get rid of all unwanted, non word characters. Add more if you want to, such as CRLF... WHILE CHARINDEX(',', @Text) > 0 SELECT @Text = REPLACE(@Text, ',', ' ') WHILE CHARINDEX('.', @Text) > 0 SELECT @Text = REPLACE(@Text, '.', ' ') -- Get rid of all double spaces WHILE CHARINDEX(' ', @Text) > 0 SELECT @Text = REPLACE(@Text, ' ', ' ') RETURN LEN(@Text) - LEN(REPLACE(@Text, ' ', '')) + 1END This is about 15 times faster than Igor's and does the same job and better.You can easily add lines of code to remove other non wanted, non word characters.Then my function is only about 10 times faster.Here is a test script to measure calculation times declare @cString nvarchar(4000), @start datetime, @i smallint, @dummy smallintselect @cstring = '2068 2683 9537 3822 9752 6135 5787 3431 8676 4790 9175 1727 71171364 4590 3954 7801 5188 7858 1898 2880 9547 3250 9819 8947 9711 4250 2796 4133 37511010 8701 7803 4055 4444 5473 6376 120 9747 6026 1193 1231 8050 8914 3849 9617 2920 17489860 1802 2458 4478 7557 8822 2118 3950 3099 7947 645 7463 1600 9353 4802 4464 2087 55727487 6043 6141 7614 2328 8417 4362 2577 9760 2697 8306 6100 9080 455 3464 4976 8726 17409764 3498 4610 5623 1503 4089 3885 9460 9702 6314 2302 7251 2603 1371 7142 7995 59241722 6945 1294 3391 2527 700 8494 6059 6642 1330 8595 7815 1518 9539 3731 1493 434 9855231 7649 77 1618 2809 4566 3435 2943 2391 1272 8002 5050 4793 9833 2779 7001 1291 50393372 118 7584 6020 1837 4709 5845 6834 2959 6987 6378 8491 1038 1231 1266 7081 4718 80835554 6123 3397 2403 367 1244 3877 4611 4088 3440 2040 9007 4701 5846 8952 1711 5618 95773530 9105 1444 4911 6263 9763 5064 1703 8253 7978 5861 479 781 7696 2272 737 1705 61143449 5101 7534 2216 2232 3634 189 4516 9898 2343 1758 1079 3355 5283 3830 3346 2788 288579 7825 76 9115 2381 8360 192 8157 2025 9065 9718 2047 4317 5613 6663 6745 9032 96128400 2092 1971 4345 8831 5662 2016 839 5364 1428 940 3910 1729 2142 3296 7482 5901 57291265 5725 7867 3910 6671 140 4385 4345 7497 7469 6243 3963 6847 3578 219 6265 8941 26123279 4984 5113 7584 4559 2499 7202 165 8266 2626 4373 5454 3228 4899 7204 9455 9209 18354601 4145 6457 5980 7994 8379 7547 3584 714 192 9166 214 1122 7769 9913 3947 876 8509213 648 8266 9662 8119 5271 9323 1208 9532 1681 1400 2405 7980 13 4377 7854 4677 74043613 6893 4054 6323 2453 2065 3261 6080 6622 3811 6955 3433 431 1270 2174 134 832 1661964 6675 7013 458 8151 3387 9441 8209 3834 2602 9036 8070 3807 2492 6535 498 9441 83901358 335 5709 6474 2654 7277 1030 4 4063 5413 5859 7511 9728 423 3688 7534 5502 17392171 1029 6241 9735 6344 8173 7204 2743 3892 1309 8805 1524 7100 2280 8237 3298 13551171 749 5045 9571 6275 6854 431 8212 430 414 2045 5 4366 1646 6257 4593 1341 1741 96969301 5545 8882 523 4406 2564 8565 5335 7965 5879 7167 8312 7167 6235 7408 5723 9411 4199365 1999 6169 6687 5185 5997 5394 561 7677 6900 874 9911 7375 7229 209 8028 5376 60472402 9885 5502 8415 4319 6201 8162 5084 1012 259 2979 8939 7874 5782 3619 4659 2990 14914810 8378 8242 4566 8064 4871 1409 133 9371 8833 8337 7228 7434 3859 8112 5348 1232 86265450 8807 5598 2260 8620 8522 4689 2368 9389 3995 2598 7161 8055 3147 230 8026 66108245 7394 1908 4452 790 9186 5141 2829 1172 6672 8349 6880 2402 2655 2575 267 5542 79174212 5516 4533 5955 7644 9943 4971 1173 5092 2194 4821 8674 7882 4348 7342 2688 66503415 5591 1622 4746 3109 2212 7944 2214 6546 8878 8441 4762 3887 1135 8050 9264 5944200 7339 2773 2699 7501 8405 6828 9842 5579 2686 4680 9840 1898 1867 4479 1910 82902840 7612 1484 4386 1108 7259 4922 4813 2762 3008 3071 8499 3422 7462 1564 5377 480 799310 4471 2752 7588 6784 940 1772 8062 4279 1130 3545 5900 4030 107 5497 8430 62195387 1403 33 6462 3989 2000 2596 1409 9734 1532 9947 8591 6058 2883 1659 586 8078 59426510 2541 6429 613 8370 2038 730 7418 2878 3636 589 1526 6466 1341 7560 9233 6155 14932088 3287 2815 311 6835 5049 7330 8226 5477 95 4653 7835 9914 6754 6795 6106 742 11453347 9864 8913 4597 9781 5312 4520 7359 5627 9082 2219 6434 8117 9523 4730 3607 74793256 8196 3929 507 5006 5579 7958 1154 8553 5859 3346 6916 8440 6620 701 2109 9708 96149853 1679 3421 5338 6324 7272 7107 4841 4387 2649 7543 631 1616 7369 3726 3575 8575 30763842 8114 6125 4625 7139 5317 2841 1705 9737 3320 5593 3937 6119 9905 7831 2665 80886222 3628 2654 1165 6176 562 6658 1441 8473 5209 3192 6102 2678 9354 4569 1840 6485 78484678 6013 2440 6030 8265 1806 7656 649 9672 740 5526 8697 4738 3226 791 1147 3540 5345895 9617 3244 2356 7214 9926 1569 1522 1123 1666 8888 5762 4597 2663 3149 5674 72644985 9669 5375 2868 3389 1534 4557 5983 7415 1993 2723 9341 2079 3324 6713 3422 277 98186783 6385 6768 4563 6983 298 3193 2027 6774 5524 7562 168 8231 1950 9397 3438 7290 46129327 1346 8580 4657'select @start = getdate(), @i = 1000/* @i is 1000 -> whole test ~ 70 sec @i is 100 -> whole test ~ 7 sec @i is 10 -> whole test ~ 1 sec*/while @i > 0 select @dummy = dbo.GETWORDCOUNT(@cString, default), @i = @i -1select datediff(ms, @start, getdate()) 'ms'select @start = getdate(), @i = 1000while @i > 0 select @dummy = dbo.fnFastWordCount(@cString), @i = @i - 1select datediff(ms, @start, getdate()) 'ms'[/code] |
|
|
spirit1
Cybernetic Yak Master
11752 Posts |
Posted - 2006-06-01 : 06:20:17
|
oh my god.... Igor's function has an error?!!?!??!??!???That's unbeliveable!!!!Someone get me some aspirin.... quick!!!Go with the flow & have fun! Else fight the flow Blog thingie: [URL="http://weblogs.sqlteam.com/mladenp"] |
|
|
SamC
White Water Yakist
3467 Posts |
Posted - 2006-06-01 : 08:42:13
|
A sticky for functions like these would be handy. I've seen several come and go over the years. |
|
|
Michael Valentine Jones
Yak DBA Kernel (pronounced Colonel)
7020 Posts |
Posted - 2006-06-01 : 10:51:40
|
quote: Originally posted by myleslee select dbo.getWordCount('we are the world , we are the children.', default)=========the result is 9 instead of 8.Because a space was accidentally input before the comma. But it would be great to filter out punctuations.thanks.
You just used the function incorrectly.You need to specify the comma as one of the delimeters, or modify it to include commas as a default delimiter.declare @Delim nvarchar(256)set @Delim = space(1)+char(9)+char(10)+','select [Word Count] = dbo.getWordCount('we are the world , we are the children.',@Delim) Results:Word Count ---------- 8(1 row(s) affected) Igor really is a very clever fellow.CODO ERGO SUM |
|
|
Igor2004
More clever than you
78 Posts |
Posted - 2006-06-08 : 11:07:49
|
To Peso1) Your function doesn’t does the same job and better.See, please, attentively the description of my function.Your function works with delimiters ' .,'.2) Your function isn’t about 15 times faster than my function.Try it, please.declare @cString nvarchar(4000), @start datetime, @i smallint, @cnt smallint, @dummy smallint, @test1 float, @test2 floatselect @cString = '2068,,,, 2683 9537 3822 9752 61.35 57 87 3 431 8676 4790 9175 1727 7117 1364 4590 3954 7801 5188 7858 1898 2880 9547 3250 9819 8947 9711 4250 2796 4133 3751 1010 8701 7803 4055 4444 5473 6376 120 9747 6026 1193 1231 8050 8914 3849 9617 292.............................2527 700 8494 6059 6642 1330 8595 7815 1518 9539 3731 1493 434 985 5231 7649 77 1618 2809 4566 3435 2943 2391 1272 8002 5050 4793 9833 2779 7001 1291 5039 3372 118 7584 6020 1837 4709 5845 6834 2959 6987 6378 8491 1038 1231 1266 7081 4718 8083 5554 6123 3397 2403 367 1244 3877 4611 4088 3440 2040 9007 4701 5846 8952 1711 5618 9577 3530 9105 1444 4911 6263 9763 5064 1703 8253 7978 5861 479 781 7696 2272 737 1705 6114 3449 5101 7534 2216 2232 3634 189 4516 9898 2343 1758 1079 3355 5283 3830 3346 2788 28 8579 7825 76 9115 2381 8360 192 8157 2025 9065 9718 2047 4317 5613 6663 6745 9032 9612 8400 2092 1971 4345 8831 5662 2016 839 5364 1428 940 3910 1729 2142 3296 7482 5901 5729 1265 5725 7867 3910 6671 140 4385 4345 7497 7469 6243 3963 6847 3578 219 6265 8941 2612 3279 4984 5113 7584 4559 2499 7202 165 8266 2626 4373 5454 3228 4899 7204 9455 9209 1835 4601 4145 6457 5980 7994 8379 7547 3584 714 192 9166 214 1122 7769 9913 3947 876 850 9213 648 8266 9662 8119 5271 9323 1208 9532 1681 1400 2405 7980 13 4377 7854 4677 7404 3613 6893 4054 6323 2453 2065 3261 6080 6622 3811 6955 3433 431 1270 2174 134 832 166 1964 6675 7013 458 8151 3387 9441 8209 3834 2602 9036 8070 3807 2492 6535 498 9441 8390 1358 335 5709 6474 2654 7277 1030 4 4063 5413 5859 7511 9728 423 3688 7534 5502 1739 2171 1029 6241 9735 6344 8173 7204 2743 3892 1309 8805 1524 7100 2280 8237 3298 1355 1171 749 5045 9571 6275 6854 431 8212 430 414 2045 5 4366 1646 6257 4593 1341 1741 9696 9301 5545 8882 523 4406 2564 8565 5335 7965 5879 7167 8312 7167 6235 7408 5723 9411 419 9365 1999 6169 6687 5185 5997 5394 561 7677 6900 874 9911 7375 7229 209 8028 5376 6047 2402 9885 5502 8415 4319 6201 8162 5084 1012 259 2979 8939 7874 5782 3619 4659 2990 1491 4810 8378 8242 4566 8064 4871 1409 133 9371 8833 8337 7228 7434 3859 8112 5348 1232 8626 5450 8807 5598 2260 8620 8522 4689 2368 9389 3995 2598 7161 8055 3147 230 8026 6610 8245 7394 1908 4452 790 9186 5141 2829 1172 6672 8349 6880 2402 2655 2575 267 5542 7917 4212 5516 4533 5955 7644 9943 4971 1173 5092 2194 4821 8674 7882 4348 7342 2688 6650 3415 5591 1622 4746 3109 2212 7944 2214 6546 8878 8441 4762 3887 1135 8050 9264 594 4200 7339 2773 2699 7501 8,,,,,,,,,,,,, 4479 1910 8290 2840 7612 1484 4386 1108 7259 4922 4813 2762 3008 3071 8499 3422 7462 1564 5377 480 799 310 4471 2752 7588 6784 940 1772 8062 4279 1130 3545 5900 4030 107 5497 8430 6219 5387 1403 33 6462 3989 2000 2596 1409 9734 1532 9947 8591 6058 2883 1659 586 8078 5942 6510 2541 6429 613 8370 2038 730 7418 2878 3636 589 1526 6466 1341 7560 9233 6155 1493 2088 3287 2815 311 6835 5049 7330 8226 5477 95 4653 7835 9914 6754 6795 6106 742 1145 3347 9864 8913 4597 9781 5312 4520 7359 5627 9082 2219 6434 8117 9523 4730 3607 7479 3256 8196 3929 507 5006 5579 7958 1154 8553 5859 3346 6916 8440 6620 701 2109 9708 9614 9853 1679 3421 5338 6324 7272 7107 4841 4387 2649 7543 631 1616 7369 3726 3575 8575 3076 3842 8114 6125 4625 7139 5317 2841 1705 9737 3320 5593 3937 6119 9905 7831 2665 8088 6222 3628 2654 1165 6176 562 6658 1441 8473 5209 3192 6102 2678 9354 4569 1840 6485 7848 4678 6013 2440 6030 8265 1806 7656 649 9672 740 5526 8697 4738 3226 791 1147 3540 534 5895 9617 3244 2356 7214 9926 1569 1522 1123 1666 8888 5762 4597 2663 3149 5674 7264 4985 9669 5375 2868 3389 1534 4557 5983 7415 1993 2723 9341 2079 3324 6713 3422 277 9818 6783 6385 6768 4563 6983 298 3193 2027 6774 5524 7562 168 8231 1950 9. 39..7 3438 ...... 7290 4612,,,, 9327 1346 .... .... 8580 465..7'set @cnt = 1000/* @cnt is 1000 -> whole test ~ 70 sec @cnt is 100 -> whole test ~ 7 sec @cnt is 10 -> whole test ~ 1 sec*/select @start = getdate(), @i = @cntwhile @i > 0 select @dummy = dbo.GETWORDCOUNT(@cString, ',. '), @i = @i -1set @test1 = datediff(ms, @start, getdate())select @test1 'GETWORDCOUNT is slow ms'select @start = getdate(), @i = @cntwhile @i > 0 select @dummy = dbo.fnFastWordCount(@cString), @i = @i - 1set @test2 = datediff(ms, @start, getdate())select @test2 'fnFastWordCount is very, very fast ms'select @test1/@test2 'faster ? 15 time ? You are sure ? :-) ' Regards,Igor |
|
|
Igor2004
More clever than you
78 Posts |
Posted - 2006-06-08 : 11:51:53
|
to Pesoin conclusionYour "better" fonction is ErroneousTry it, pleaseselect dbo.fnFastWordCount('.'), dbo.fnFastWordCount(' ') -- display 1, 1FYI quote: The accuracy criteria of user-defined functionsGETWORDNUM() , GETWORDCOUNT(), GETALLWORDS(), GETALLWORDS2() working with strings:1) Both character parameters are empty - the function returns nothing - either 0 strings, or 0.2) The first parameter is not empty, the second is empty - the function returns the table from one string or 1. The first parameter is in this case the required word.3) The function works correctly with strings starting and/or finishing with one or several delimiters.4) The function works correctly on strings consisting only of delimiters - a result in this case is 0 strings, or 0 words.5) The function works correctly irrespective of the character sets in the string or delimiters, including /, \,?, ^, %, -, ' etc., that is, any character that have special function. No exceptions, if the parameters of an incorrect type are transmitted, there is a standard error message.Above mentioned and other functions completely correspond to these criteria.
See, please [url]http://www.universalthread.com/wconnect/wc.dll?LevelExtreme~2,54,33,27115[/url]Igor |
|
|
X002548
Not Just a Number
15586 Posts |
|
SwePeso
Patron Saint of Lost Yaks
30421 Posts |
Posted - 2006-06-08 : 16:27:35
|
quote: 2) Your function isn’t about 15 times faster than my function.Try it, please.Regards,Igor
I tried. I got 50.0 ms for your function and 0.0 ms for my function, and division by zero. Does that mean that my function is infinity times faster than yours? I was just pointing out that there were others ways to go... No hard feelings? |
|
|
Igor2004
More clever than you
78 Posts |
Posted - 2006-06-08 : 17:19:38
|
-- This is my result of above mentioned test5530 ms GETWORDCOUNT8076 ms fnFastWordCount0.684744923229321 = 5530 / 8076-- Here is my information:SELECT SERVERPROPERTY('Edition') -- displays Developer EditionSELECT SERVERPROPERTY('ProductVersion') -- displays 9.00.1399.06 Try the test with my example of the string, please |
|
|
Igor2004
More clever than you
78 Posts |
Posted - 2006-06-08 : 17:56:00
|
May be results are different because Collations Settings are differentMy Collation Setting is Case Sensitivequote: I was just pointing out that there were others ways to go
This is small correctionquote: I was just pointing out that there were others UNCOMFORTABLES ans ERRONEOUS ways to go
Try it, pleaseselect dbo.fnFastWordCount('.....s....s.')The difference is large between'uncomfortable erroneous' and 'better faster'Regards,Igor |
|
|
Igor2004
More clever than you
78 Posts |
Posted - 2006-06-08 : 18:07:57
|
This is example very very very fast function !This is other way to go !CREATE FUNCTION dbo.fnVeryVeryVeryFastWordCount( @Text NVARCHAR(4000))RETURNS SMALLINTASBEGIN RETURN 1ENDGO But this function is erroneous as your function. |
|
|
SwePeso
Patron Saint of Lost Yaks
30421 Posts |
Posted - 2006-06-09 : 01:53:19
|
quote: Try it, pleaseselect dbo.fnFastWordCount('.....s....s.')select dbo.fnFastWordCount('.'), dbo.fnFastWordCount(' ')The difference is large between'uncomfortable erroneous' and 'better faster'Regards,Igor
I didn't know those ['.....s....s.', ',' and ' '] were real sentences.Igor, I believe you have teached me something new today!BTW, I get 757 words (in 0.0 ms), as you (in 50.0 ms), in your last example starting with select @cString = '2068,,,, 2683 9537 andSELECT SERVERPROPERTY('Edition') -- Enterprise EditionSELECT SERVERPROPERTY('ProductVersion') -- 9.00.2153.00SELECT SERVERPROPERTY('Edition') -- Developer EditionSELECT SERVERPROPERTY('ProductVersion') -- 8.00.2039SELECT SERVERPROPERTY('Edition') -- Developer EditionSELECT SERVERPROPERTY('ProductVersion') -- 10.00.208.00 |
|
|
Igor2004
More clever than you
78 Posts |
Posted - 2006-06-09 : 17:34:37
|
quote: Igor, I believe you have teached me something new today!
SurelyThis is function without errors-- Igor Nikiforov "Quality - forever" -- GETWORDCOUNT1() User-Defined Function Counts the words in a string. -- GETWORDCOUNT1(@cString[, @cDelimiters]) -- Parameters -- @cString nvarchar(4000) - Specifies the string whose words will be counted. -- @cDelimiters nvarchar(256) - Optional. Specifies one or more optional characters used to separate words in @cString. -- The default delimiters are space, ',' and '.'. Note that GETWORDCOUNT1( ) uses each of the characters in @cDelimiters as individual delimiters, not the entire string as a single delimiter. -- Return Value smallint -- Remarks GETWORDCOUNT1() by default assumes that words are delimited by spaces or ',' and '.'. If you specify another character as delimiter, this function ignores spaces ',' and '.' and uses only the specified character. -- If you use 'AAA aaa, BBB bbb, CCC ccc.' as the target string for dbo.GETWORDCOUNT1(), you can get all the following results. -- declare @cString nvarchar(4000) -- set @cString = 'AAA aaa, BBB bbb, CCC ccc.' -- select dbo.GETWORDCOUNT1(@cString, default) -- 6 - character groups, delimited by ' ' -- select dbo.GETWORDCOUNT1(@cString, ',') -- 3 - character groups, delimited by ',' -- select dbo.GETWORDCOUNT1(@cString, '.') -- 1 - character group, delimited by '.'CREATE function [dbo].[GETWORDCOUNT1] (@cString nvarchar(4000), @cDelimiters nvarchar(256) = ' ,.' )returns smallint as begin declare @k smallint select @k = 2, @cDelimiters = isnull(@cDelimiters, ' ,.') -- if no break string is specified, the function uses spaces, ',' and '.' to delimit words. if datalength(@cString) = 0 return 0 if datalength(@cDelimiters) = 0 return 1 while @k <= datalength(@cDelimiters)/2 select @cString = replace(@cString COLLATE Latin1_General_BIN, substring(@cDelimiters, @k, 1) COLLATE Latin1_General_BIN, left(@cDelimiters, 1) COLLATE Latin1_General_BIN), @k = @k + 1 while charindex(replicate(left(@cDelimiters, 1),2), @cString) > 0 select @cString = replace(@cString COLLATE Latin1_General_BIN, replicate(left(@cDelimiters, 1),2) COLLATE Latin1_General_BIN, left(@cDelimiters, 1) COLLATE Latin1_General_BIN) if ascii(left(@cDelimiters, 1)) = ascii(left(@cString, 1)) select @cString = right(@cString, datalength(@cString)/2 - 1) if ascii(left(@cDelimiters, 1)) = ascii(right(@cString, 1)) select @cString = left(@cString, datalength(@cString)/2 - 1) return case when datalength(@cString) = 0 then 0 else (datalength(@cString) - datalength(replace(@cString COLLATE Latin1_General_BIN, left(@cDelimiters, 1) COLLATE Latin1_General_BIN, '')))/2 + 1 end end Try it, pleaseselect [dbo].[GETWORDCOUNT1]('', '')select [dbo].[GETWORDCOUNT1]('1', '')select [dbo].[GETWORDCOUNT1]('', '123456789')select [dbo].[GETWORDCOUNT1]('123456789', '123456789')select [dbo].[GETWORDCOUNT1](' 1 2 3 4 5 6 7 8 9 ', '123456789')select [dbo].[GETWORDCOUNT1]('Igor Nikiforov "Quality - forever"', default) This is my traditional function -- GETWORDCOUNT() User-Defined Function Counts the words in a string. -- GETWORDCOUNT(@cString[, @cDelimiters]) -- Parameters -- @cString nvarchar(4000) - Specifies the string whose words will be counted. -- @cDelimiters nvarchar(256) - Optional. Specifies one or more optional characters used to separate words in @cString. -- The default delimiters are space, tab, carriage return, and line feed. Note that GETWORDCOUNT( ) uses each of the characters in @cDelimiters as individual delimiters, not the entire string as a single delimiter. -- Return Value smallint -- Remarks GETWORDCOUNT() by default assumes that words are delimited by spaces or tabs. If you specify another character as delimiter, this function ignores spaces and tabs and uses only the specified character. -- If you use 'AAA aaa, BBB bbb, CCC ccc.' as the target string for dbo.GETWORDCOUNT(), you can get all the following results. -- declare @cString nvarchar(4000) -- set @cString = 'AAA aaa, BBB bbb, CCC ccc.' -- select dbo.GETWORDCOUNT(@cString, default) -- 6 - character groups, delimited by ' ' -- select dbo.GETWORDCOUNT(@cString, ',') -- 3 - character groups, delimited by ',' -- select dbo.GETWORDCOUNT(@cString, '.') -- 1 - character group, delimited by '.' -- See Also GETWORDNUM(), GETALLWORDS() User-Defined Functions -- UDF the name and functionality of which correspond to the Visual FoxPro function CREATE function [dbo].[GETWORDCOUNT] (@cString nvarchar(4000), @cDelimiters nvarchar(256) )returns smallint as begin declare @k smallint, @nEndString smallint, @wordcount smallint select @k = 1, @wordcount = 0, @cDelimiters = isnull(@cDelimiters, nchar(32)+nchar(9)+nchar(10)+nchar(13)), -- if no break string is specified, the function uses spaces, tabs, carriage return and line feed to delimit words. @nEndString = 1 + datalength(@cString)/(case SQL_VARIANT_PROPERTY(@cString,'BaseType') when 'nvarchar' then 2 else 1 end) -- for unicode while charindex(substring(@cString, @k, 1) COLLATE Latin1_General_BIN, @cDelimiters COLLATE Latin1_General_BIN) > 0 and @nEndString > @k -- skip opening break characters, if any set @k = @k + 1 if @k < @nEndString begin set @wordcount = 1 -- count the one we are in now count transitions from 'not in word' to 'in word' -- if the current character is a break char, but the next one is not, we have entered a new word while @k < @nEndString begin if @k +1 < @nEndString and charindex(substring(@cString, @k, 1) COLLATE Latin1_General_BIN, @cDelimiters COLLATE Latin1_General_BIN) > 0 and charindex(substring(@cString, @k+1, 1) COLLATE Latin1_General_BIN, @cDelimiters COLLATE Latin1_General_BIN) = 0 select @wordcount = @wordcount + 1, @k = @k + 1 -- Skip over the first character in the word. We know it cannot be a break character. set @k = @k + 1 end end return @wordcount end |
|
|
SwePeso
Patron Saint of Lost Yaks
30421 Posts |
Posted - 2006-06-12 : 05:04:57
|
Yes, I tried you function without errors, again, as you asked me to.It still confuses me that hyphen is a word in your example (I get 5 as output result).select [dbo].[GETWORDCOUNT1]('Igor Nikiforov "Quality - forever"', default) If I change select [dbo].[GETWORDCOUNT1]('Igor Nikiforov "Quality - forever"', default) to select [dbo].[GETWORDCOUNT1]('Igor Nikiforov "Quality - forever"', '-,. ') I get 4 words as I would expect. That's ok! I understand the difference.But then I have a problem with select [dbo].[GETWORDCOUNT1]('Peter Larsson is very, very annoying with so-called comments.', '-,. ') Your function returns 10, where I would expect 9.With this select, I would like to have 14 as returned outputselect [dbo].[GETWORDCOUNT1]('How many alpha-numeric characters are there in the english alphabet - 26, 52 or 91?', '-,. ') not 15 as the current output tells me.But maybe that is a feature in your function without errors?Perhaps it would be possible to check for imbedded hyphens? If surrounded by non-delimiter characters, don't treat hyphen as delimiter, even if selected as one?Peter LarssonHelsingborg, Sweden |
|
|
Igor2004
More clever than you
78 Posts |
Posted - 2006-06-12 : 10:57:11
|
Hi, Peterselect [dbo].[GETWORDCOUNT1]('Peter Larsson is very, very annoying with so-called comments.', '-,. ') -- displau 10 quote: Your function returns 10, where I would expect 9.
1 Peter 2 Larsson 3 is4 very5 very6 annoying7 with8 so9 called10 commentsSee, attentively, the delimiters, pleaseBTW1) Do you speak French ?2) Would you like help me. I need the translation 18 phrases to Swedish language ?Igor.http://udfunctions.blogspot.com/ |
|
|
Igor2004
More clever than you
78 Posts |
Posted - 2006-06-12 : 11:02:10
|
quote: Perhaps it would be possible to check for imbedded hyphens? If surrounded by non-delimiter characters, don't treat hyphen as delimiter, even if selected as one?
Certainly, butthis is a problem is more complex.Igor.http://udfunctions.blogspot.com/ |
|
|
SwePeso
Patron Saint of Lost Yaks
30421 Posts |
Posted - 2006-06-12 : 12:04:50
|
quote: BTW1) Do you speak French ?2) Would you like help me. I need the translation 18 phrases to Swedish language ?Igor.
1) Oui, je parle francais en peu.2) Glad to help.Peter LarssonHelsingborg, Sweden |
|
|
Igor2004
More clever than you
78 Posts |
Posted - 2006-06-12 : 22:02:10
|
quote: Perhaps it would be possible to check for imbedded hyphens? If surrounded by non-delimiter characters, don't treat hyphen as delimiter, even if selected as one?
This is function, without errors :-)CREATE function [dbo].[GETWORDCOUNT3] (@cString nvarchar(4000), @cDelimiters1 nvarchar(256) = ' ,.', @cDelimiters2 nvarchar(256) = '-')returns smallint as begin declare @wordcount smallint select @cDelimiters1 = isnull(@cDelimiters1, ' ,.'), -- if no break string1 is specified, the function uses spaces, '.' and ',' to delimit words. @cDelimiters2 = isnull(@cDelimiters2, '-') -- if no break string2 is specified, the function uses '-' to delimit words. select @wordcount = count(*) from dbo.GETALLWORDS_AUXILIARY(@cString, @cDelimiters1+@cDelimiters2 ) where WORDNUM = 1 or not(datalength(DELIMITERS)/2 = 1 and charindex(DELIMITERS, @cDelimiters2) > 0) return @wordcount end -- GETALLWORDS_AUXILIARY() User-Defined Function Inserts the words from a string into the table. -- GETALLWORDS_AUXILIARY(@cString[, @cDelimiters]) -- Parameters -- @cString nvarchar(4000) - Specifies the string whose words will be inserted into the table @GETALLWORDS. -- @cDelimiters nvarchar(256) - Optional. Specifies one or more optional characters used to separate words in @cString. -- The default delimiters are space, tab, carriage return, and line feed. Note that GETALLWORDS_AUXILIARY( ) uses each of the characters in @cDelimiters as individual delimiters, not the entire string as a single delimiter. -- Return Value table -- Remarks GETALLWORDS_AUXILIARY() by default assumes that words are delimited by spaces or tabs. If you specify another character as delimiter, this function ignores spaces and tabs and uses only the specified character. -- Example -- declare @cString nvarchar(4000) -- set @cString = 'The default delimiters are space, tab, carriage return, and line feed. If you specify another character as delimiter, this function ignores spaces and tabs and uses only the specified character.' -- select * from dbo.GETALLWORDS_AUXILIARY(@cString, default) -- select * from dbo.GETALLWORDS_AUXILIARY(@cString, ' ,.') -- See Also GETWORDNUM() , GETWORDCOUNT() User-Defined Functions ALTER function [dbo].[GETALLWORDS_AUXILIARY] (@cString nvarchar(4000), @cDelimiters nvarchar(256))returns @GETALLWORDS table (WORDNUM smallint, WORD nvarchar(4000), STARTOFWORD smallint, LENGTHOFWORD smallint, DELIMITERS nvarchar(4000)) begin declare @k smallint, @wordcount smallint, @nEndString smallint, @BegOfWord smallint, @flag bit, @EndPreviousWord smallint, @Delimiters nvarchar(4000) select @k = 1, @wordcount = 1, @BegOfWord = 1, @flag = 0, @cString = isnull(@cString, ''), @Delimiters = '', @cDelimiters = isnull(@cDelimiters, nchar(32)+nchar(9)+nchar(10)+nchar(13)), -- if no break string is specified, the function uses spaces, tabs, carriage return and line feed to delimit words. @nEndString = 1 + datalength(@cString) /(case SQL_VARIANT_PROPERTY(@cString,'BaseType') when 'nvarchar' then 2 else 1 end) -- for unicode while 1 > 0 begin if @k - @BegOfWord > 0 begin if @wordcount > 1 begin select @EndPreviousWord = STARTOFWORD + LENGTHOFWORD from @GETALLWORDS where WORDNUM = @wordcount - 1 select @Delimiters = substring(@cString, @EndPreviousWord, @BegOfWord - @EndPreviousWord) end else if @BegOfWord > 1 select @Delimiters = substring(@cString, 1, @BegOfWord -1) insert into @GETALLWORDS (WORDNUM, WORD, STARTOFWORD, LENGTHOFWORD, DELIMITERS) values( @wordcount, substring(@cString, @BegOfWord, @k-@BegOfWord), @BegOfWord, @k-@BegOfWord, @Delimiters) -- previous word select @wordcount = @wordcount + 1, @BegOfWord = @k end if @flag = 1 break while charindex(substring(@cString, @k, 1) COLLATE Latin1_General_BIN, @cDelimiters COLLATE Latin1_General_BIN) > 0 and @nEndString > @k -- skip break characters, if any select @k = @k + 1, @BegOfWord = @BegOfWord + 1 while charindex(substring(@cString, @k, 1) COLLATE Latin1_General_BIN, @cDelimiters COLLATE Latin1_General_BIN) = 0 and @nEndString > @k -- skip the character in the word select @k = @k + 1 if @k >= @nEndString select @flag = 1 end return end select [dbo].[GETWORDCOUNT3]('Peter Larsson - is very, very annoying with so-called comments.', default, default) -- display 9 http://udfunctions.blogspot.com/ |
|
|
X002548
Not Just a Number
15586 Posts |
|
Next Page
|
|
|
|
|