1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123
| function word_indices = processEmail(email_contents) %PROCESSEMAIL preprocesses a the body of an email and %returns a list of word_indices % word_indices = PROCESSEMAIL(email_contents) preprocesses % the body of an email and returns a list of indices of the % words contained in the email. %
% Load Vocabulary vocabList = getVocabList();
% Init return value word_indices = [];
% ========================== Preprocess Email ===========================
% Find the Headers ( \n\n and remove ) % Uncomment the following lines if you are working with raw emails with the % full headers
% hdrstart = strfind(email_contents, ([char(10) char(10)])); % email_contents = email_contents(hdrstart(1):end);
% Lower case email_contents = lower(email_contents);
% Strip all HTML % Looks for any expression that starts with < and ends with > and replace % and does not have any < or > in the tag it with a space email_contents = regexprep(email_contents, '<[^<>]+>', ' ');
% Handle Numbers % Look for one or more characters between 0-9 email_contents = regexprep(email_contents, '[0-9]+', 'number');
% Handle URLS % Look for strings starting with http:// or https:// email_contents = regexprep(email_contents, ... '(http|https)://[^\s]*', 'httpaddr');
% Handle Email Addresses % Look for strings with @ in the middle email_contents = regexprep(email_contents, '[^\s]+@[^\s]+', 'emailaddr');
% Handle $ sign email_contents = regexprep(email_contents, '[$]+', 'dollar');
% ========================== Tokenize Email ===========================
% Output the email to screen as well fprintf('\n==== Processed Email ====\n\n');
% Process file l = 0;
while ~isempty(email_contents)
% Tokenize and also get rid of any punctuation [str, email_contents] = ... strtok(email_contents, ... [' @$/#.-:&*+=[]?!(){},''">_<;%' char(10) char(13)]); % Remove any non alphanumeric characters str = regexprep(str, '[^a-zA-Z0-9]', '');
% Stem the word % (the porterStemmer sometimes has issues, so we use a try catch block) try str = porterStemmer(strtrim(str)); catch str = ''; continue; end;
% Skip the word if it is too short if length(str) < 1 continue; end
% Look up the word in the dictionary and add to word_indices if % found % ====================== YOUR CODE HERE ====================== % Instructions: Fill in this function to add the index of str to % word_indices if it is in the vocabulary. At this point % of the code, you have a stemmed word from the email in % the variable str. You should look up str in the % vocabulary list (vocabList). If a match exists, you % should add the index of the word to the word_indices % vector. Concretely, if str = 'action', then you should % look up the vocabulary list to find where in vocabList % 'action' appears. For example, if vocabList{18} = % 'action', then, you should add 18 to the word_indices % vector (e.g., word_indices = [word_indices ; 18]; ). % % Note: vocabList{idx} returns a the word with index idx in the % vocabulary list. % % Note: You can use strcmp(str1, str2) to compare two strings (str1 and % str2). It will return 1 only if the two strings are equivalent. %
for i=1:length(vocabList) if strcmp(vocabList{i}, str) word_indices = [word_indices; i]; break; end end
% =============================================================
% Print to screen, ensuring that the output lines are not too long if (l + length(str) + 1) > 78 fprintf('\n'); l = 0; end fprintf('%s ', str); l = l + length(str) + 1;
end
% Print footer fprintf('\n\n=========================\n');
end
|